diff --git "a/logs/logs/xp3capmixnewcodelonglossseq/main_log.txt" "b/logs/logs/xp3capmixnewcodelonglossseq/main_log.txt" new file mode 100644--- /dev/null +++ "b/logs/logs/xp3capmixnewcodelonglossseq/main_log.txt" @@ -0,0 +1,78848 @@ +WARNING:__main__: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +WARNING:__main__: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +WARNING:__main__: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +WARNING:__main__: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +WARNING:__main__: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +WARNING:__main__: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +WARNING:__main__: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +WARNING:__main__: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +WARNING:__main__: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +WARNING:__main__: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +WARNING:__main__: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +WARNING:__main__: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +[default0]:Traceback (most recent call last): +[default0]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default0]: main() +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default0]: return f(*args, **kwargs) +[default2]:Traceback (most recent call last): +[default2]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default2]: main() +[default0]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default0]: pretrain( +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default0]: initialize_megatron(extra_args_provider=extra_args_provider, +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default2]: return f(*args, **kwargs) +[default2]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default2]: pretrain( +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default2]: initialize_megatron(extra_args_provider=extra_args_provider, +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron +[default2]: set_global_variables(extra_args_provider=extra_args_provider, +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables +[default2]: _build_num_microbatches_calculator(args) +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator +[default2]: _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator +[default2]: num_microbatches_calculator = ConstantNumMicroBatches( +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ +[default2]: assert global_batch_size % micro_batch_times_data_parallel == 0, \ +[default2]:AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (48) +[default1]:Traceback (most recent call last): +[default1]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default3]:Traceback (most recent call last): +[default3]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default3]: main() +[default1]: main() +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default0]: set_global_variables(extra_args_provider=extra_args_provider, +[default1]: return f(*args, **kwargs) +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default3]: return f(*args, **kwargs) +[default3]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default3]: pretrain( +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default3]: initialize_megatron(extra_args_provider=extra_args_provider, +[default1]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables +[default0]: _build_num_microbatches_calculator(args) +[default1]: pretrain( +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator +[default0]: _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator +[default0]: num_microbatches_calculator = ConstantNumMicroBatches( +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron +[default3]: set_global_variables(extra_args_provider=extra_args_provider, +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables +[default3]: _build_num_microbatches_calculator(args) +[default0]: assert global_batch_size % micro_batch_times_data_parallel == 0, \ +[default0]:AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (48) +[default1]: initialize_megatron(extra_args_provider=extra_args_provider, +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron +[default1]: set_global_variables(extra_args_provider=extra_args_provider, +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator +[default3]: _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +[default1]: _build_num_microbatches_calculator(args) +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator +[default1]: _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator +[default3]: num_microbatches_calculator = ConstantNumMicroBatches( +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator +[default1]: num_microbatches_calculator = ConstantNumMicroBatches( +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ +[default3]: assert global_batch_size % micro_batch_times_data_parallel == 0, \ +[default1]: assert global_batch_size % micro_batch_times_data_parallel == 0, \ +[default1]:AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (48) +[default3]:AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (48) +[default4]:Traceback (most recent call last): +[default4]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default4]: main() +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default4]: return f(*args, **kwargs) +[default4]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default4]: pretrain( +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default4]: initialize_megatron(extra_args_provider=extra_args_provider, +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron +[default4]: set_global_variables(extra_args_provider=extra_args_provider, +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables +[default4]: _build_num_microbatches_calculator(args) +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator +[default4]: _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator +[default4]: num_microbatches_calculator = ConstantNumMicroBatches( +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ +[default4]: assert global_batch_size % micro_batch_times_data_parallel == 0, \ +[default4]:AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (48) +[default5]:Traceback (most recent call last): +[default7]:Traceback (most recent call last): +[default7]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default7]: main() +[default5]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default5]: main() +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default5]: return f(*args, **kwargs) +[default5]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default7]: return f(*args, **kwargs) +[default7]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default7]: pretrain( +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default7]: initialize_megatron(extra_args_provider=extra_args_provider, +[default5]: pretrain( +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default5]: initialize_megatron(extra_args_provider=extra_args_provider, +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron +[default5]: set_global_variables(extra_args_provider=extra_args_provider, +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron +[default7]: set_global_variables(extra_args_provider=extra_args_provider, +[default6]:Traceback (most recent call last): +[default6]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default6]: main() +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default6]: return f(*args, **kwargs) +[default6]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default6]: pretrain( +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default6]: initialize_megatron(extra_args_provider=extra_args_provider, +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables +[default7]: _build_num_microbatches_calculator(args) +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator +[default5]: _build_num_microbatches_calculator(args) +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator +[default5]: _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron +[default5]: num_microbatches_calculator = ConstantNumMicroBatches( +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ +[default5]: assert global_batch_size % micro_batch_times_data_parallel == 0, \ +[default7]: _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +[default6]: set_global_variables(extra_args_provider=extra_args_provider, +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables +[default6]: _build_num_microbatches_calculator(args) +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator +[default6]: _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator +[default6]: num_microbatches_calculator = ConstantNumMicroBatches( +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ +[default6]: assert global_batch_size % micro_batch_times_data_parallel == 0, \ +[default5]:AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (48) +[default6]:AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (48) +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator +[default7]: num_microbatches_calculator = ConstantNumMicroBatches( +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ +[default7]: assert global_batch_size % micro_batch_times_data_parallel == 0, \ +[default7]:AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (48) +[default2]:Traceback (most recent call last): +[default1]:Traceback (most recent call last): +[default0]:Traceback (most recent call last): +[default0]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default0]: main() +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default0]: return f(*args, **kwargs) +[default2]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default1]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default2]: main() +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default2]: return f(*args, **kwargs) +[default2]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default2]: pretrain( +[default0]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default1]: main() +[default3]:Traceback (most recent call last): +[default3]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default3]: main() +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default3]: return f(*args, **kwargs) +[default3]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default3]: pretrain( +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default2]: initialize_megatron(extra_args_provider=extra_args_provider, +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron +[default2]: set_global_variables(extra_args_provider=extra_args_provider, +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables +[default2]: _build_num_microbatches_calculator(args) +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator +[default0]: pretrain( +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default0]: initialize_megatron(extra_args_provider=extra_args_provider, +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron +[default2]: _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator +[default2]: num_microbatches_calculator = ConstantNumMicroBatches( +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ +[default2]: assert global_batch_size % micro_batch_times_data_parallel == 0, \ +[default2]:AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (48) +[default3]: initialize_megatron(extra_args_provider=extra_args_provider, +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron +[default3]: set_global_variables(extra_args_provider=extra_args_provider, +[default0]: set_global_variables(extra_args_provider=extra_args_provider, +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables +[default0]: _build_num_microbatches_calculator(args) +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator +[default0]: _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables +[default3]: _build_num_microbatches_calculator(args) +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator +[default3]: _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator +[default3]: num_microbatches_calculator = ConstantNumMicroBatches( +[default0]: num_microbatches_calculator = ConstantNumMicroBatches( +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ +[default0]: assert global_batch_size % micro_batch_times_data_parallel == 0, \ +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ +[default3]: assert global_batch_size % micro_batch_times_data_parallel == 0, \ +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default1]: return f(*args, **kwargs) +[default1]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default1]: pretrain( +[default0]:AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (48) +[default3]:AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (48) +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default1]: initialize_megatron(extra_args_provider=extra_args_provider, +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron +[default1]: set_global_variables(extra_args_provider=extra_args_provider, +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables +[default1]: _build_num_microbatches_calculator(args) +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator +[default1]: _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator +[default1]: num_microbatches_calculator = ConstantNumMicroBatches( +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ +[default1]: assert global_batch_size % micro_batch_times_data_parallel == 0, \ +[default1]:AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (48) +[default4]:Traceback (most recent call last): +[default4]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default4]: main() +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default4]: return f(*args, **kwargs) +[default4]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default4]: pretrain( +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default4]: initialize_megatron(extra_args_provider=extra_args_provider, +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron +[default4]: set_global_variables(extra_args_provider=extra_args_provider, +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables +[default4]: _build_num_microbatches_calculator(args) +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator +[default4]: _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator +[default4]: num_microbatches_calculator = ConstantNumMicroBatches( +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ +[default4]: assert global_batch_size % micro_batch_times_data_parallel == 0, \ +[default4]:AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (48) +[default0]:Traceback (most recent call last): +[default0]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default0]: main() +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default0]: return f(*args, **kwargs) +[default0]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default0]: pretrain( +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default0]: initialize_megatron(extra_args_provider=extra_args_provider, +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron +[default0]: set_global_variables(extra_args_provider=extra_args_provider, +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables +[default0]: _build_num_microbatches_calculator(args) +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator +[default0]: _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator +[default0]: num_microbatches_calculator = ConstantNumMicroBatches( +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ +[default0]: assert global_batch_size % micro_batch_times_data_parallel == 0, \ +[default0]:AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (48) +[default2]:Traceback (most recent call last): +[default1]:Traceback (most recent call last): +[default1]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default1]: main() +[default2]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default2]: main() +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default1]: return f(*args, **kwargs) +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default2]: return f(*args, **kwargs) +[default2]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default1]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default1]: pretrain( +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default1]: initialize_megatron(extra_args_provider=extra_args_provider, +[default2]: pretrain( +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default2]: initialize_megatron(extra_args_provider=extra_args_provider, +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron +[default1]: set_global_variables(extra_args_provider=extra_args_provider, +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables +[default1]: _build_num_microbatches_calculator(args) +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator +[default1]: _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator +[default1]: num_microbatches_calculator = ConstantNumMicroBatches( +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ +[default2]: set_global_variables(extra_args_provider=extra_args_provider, +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables +[default1]: assert global_batch_size % micro_batch_times_data_parallel == 0, \ +[default2]: _build_num_microbatches_calculator(args) +[default1]:AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (48) +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator +[default2]: _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator +[default2]: num_microbatches_calculator = ConstantNumMicroBatches( +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ +[default2]: assert global_batch_size % micro_batch_times_data_parallel == 0, \ +[default2]:AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (48) +[default3]:Traceback (most recent call last): +[default3]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default3]: main() +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default3]: return f(*args, **kwargs) +[default3]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default3]: pretrain( +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default3]: initialize_megatron(extra_args_provider=extra_args_provider, +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron +[default3]: set_global_variables(extra_args_provider=extra_args_provider, +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables +[default3]: _build_num_microbatches_calculator(args) +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator +[default3]: _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator +[default3]: num_microbatches_calculator = ConstantNumMicroBatches( +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ +[default3]: assert global_batch_size % micro_batch_times_data_parallel == 0, \ +[default3]:AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (48) +[default4]:Traceback (most recent call last): +[default4]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default4]: main() +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default4]: return f(*args, **kwargs) +[default4]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default4]: pretrain( +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default4]: initialize_megatron(extra_args_provider=extra_args_provider, +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron +[default4]: set_global_variables(extra_args_provider=extra_args_provider, +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables +[default4]: _build_num_microbatches_calculator(args) +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator +[default4]: _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator +[default4]: num_microbatches_calculator = ConstantNumMicroBatches( +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ +[default4]: assert global_batch_size % micro_batch_times_data_parallel == 0, \ +[default4]:AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (48) +[default6]:Traceback (most recent call last): +[default6]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default6]: main() +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default5]:Traceback (most recent call last): +[default5]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default5]: main() +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default6]: return f(*args, **kwargs) +[default6]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default6]: pretrain( +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default5]: return f(*args, **kwargs) +[default5]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default6]: initialize_megatron(extra_args_provider=extra_args_provider, +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron +[default6]: set_global_variables(extra_args_provider=extra_args_provider, +[default5]: pretrain( +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default5]: initialize_megatron(extra_args_provider=extra_args_provider, +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables +[default6]: _build_num_microbatches_calculator(args) +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron +[default5]: set_global_variables(extra_args_provider=extra_args_provider, +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables +[default5]: _build_num_microbatches_calculator(args) +[default6]: _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator +[default6]: num_microbatches_calculator = ConstantNumMicroBatches( +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ +[default6]: assert global_batch_size % micro_batch_times_data_parallel == 0, \ +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator +[default5]: _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +[default6]:AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (48) +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator +[default5]: num_microbatches_calculator = ConstantNumMicroBatches( +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ +[default5]: assert global_batch_size % micro_batch_times_data_parallel == 0, \ +[default5]:AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (48) +[default6]:Traceback (most recent call last): +[default6]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default6]: main() +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default6]: return f(*args, **kwargs) +[default6]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default6]: pretrain( +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default6]: initialize_megatron(extra_args_provider=extra_args_provider, +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron +[default6]: set_global_variables(extra_args_provider=extra_args_provider, +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables +[default6]: _build_num_microbatches_calculator(args) +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator +[default6]: _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator +[default6]: num_microbatches_calculator = ConstantNumMicroBatches( +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ +[default6]: assert global_batch_size % micro_batch_times_data_parallel == 0, \ +[default6]:AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (48) +[default5]:Traceback (most recent call last): +[default7]:Traceback (most recent call last): +[default7]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default5]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default5]: main() +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default5]: return f(*args, **kwargs) +[default5]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default5]: pretrain( +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default5]: initialize_megatron(extra_args_provider=extra_args_provider, +[default7]: main() +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default7]: return f(*args, **kwargs) +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron +[default5]: set_global_variables(extra_args_provider=extra_args_provider, +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables +[default7]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default7]: pretrain( +[default5]: _build_num_microbatches_calculator(args) +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default7]: initialize_megatron(extra_args_provider=extra_args_provider, +[default5]: _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator +[default5]: num_microbatches_calculator = ConstantNumMicroBatches( +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ +[default5]: assert global_batch_size % micro_batch_times_data_parallel == 0, \ +[default5]:AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (48) +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron +[default7]: set_global_variables(extra_args_provider=extra_args_provider, +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables +[default7]: _build_num_microbatches_calculator(args) +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator +[default7]: _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator +[default7]: num_microbatches_calculator = ConstantNumMicroBatches( +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ +[default7]: assert global_batch_size % micro_batch_times_data_parallel == 0, \ +[default7]:AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (48) +[default6]:Traceback (most recent call last): +[default5]:Traceback (most recent call last): +[default5]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default5]: main() +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default5]: return f(*args, **kwargs) +[default5]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default7]:Traceback (most recent call last): +[default7]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default7]: main() +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default6]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default6]: main() +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default5]: pretrain( +[default6]: return f(*args, **kwargs) +[default6]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default6]: pretrain( +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default7]: return f(*args, **kwargs) +[default7]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default7]: pretrain( +[default6]: initialize_megatron(extra_args_provider=extra_args_provider, +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron +[default6]: set_global_variables(extra_args_provider=extra_args_provider, +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default7]: initialize_megatron(extra_args_provider=extra_args_provider, +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron +[default5]: initialize_megatron(extra_args_provider=extra_args_provider, +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables +[default6]: _build_num_microbatches_calculator(args) +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator +[default6]: _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +[default7]: set_global_variables(extra_args_provider=extra_args_provider, +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables +[default7]: _build_num_microbatches_calculator(args) +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron +[default5]: set_global_variables(extra_args_provider=extra_args_provider, +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator +[default7]: _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator +[default7]: num_microbatches_calculator = ConstantNumMicroBatches( +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ +[default7]: assert global_batch_size % micro_batch_times_data_parallel == 0, \ +[default7]:AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (48) +[default5]: _build_num_microbatches_calculator(args) +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator +[default5]: _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator +[default6]: num_microbatches_calculator = ConstantNumMicroBatches( +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator +[default5]: num_microbatches_calculator = ConstantNumMicroBatches( +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ +[default6]: assert global_batch_size % micro_batch_times_data_parallel == 0, \ +[default6]:AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (48) +[default5]: assert global_batch_size % micro_batch_times_data_parallel == 0, \ +[default5]:AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (48) +[default3]:Traceback (most recent call last): +[default3]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default3]: main() +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default3]: return f(*args, **kwargs) +[default3]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default3]: pretrain( +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default3]: initialize_megatron(extra_args_provider=extra_args_provider, +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron +[default3]: set_global_variables(extra_args_provider=extra_args_provider, +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables +[default3]: _build_num_microbatches_calculator(args) +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator +[default3]: _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator +[default3]: num_microbatches_calculator = ConstantNumMicroBatches( +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ +[default3]: assert global_batch_size % micro_batch_times_data_parallel == 0, \ +[default3]:AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (48) +[default4]:Traceback (most recent call last): +[default4]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default4]: main() +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default4]: return f(*args, **kwargs) +[default4]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default4]: pretrain( +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default4]: initialize_megatron(extra_args_provider=extra_args_provider, +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron +[default4]: set_global_variables(extra_args_provider=extra_args_provider, +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables +[default4]: _build_num_microbatches_calculator(args) +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator +[default4]: _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator +[default4]: num_microbatches_calculator = ConstantNumMicroBatches( +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ +[default4]: assert global_batch_size % micro_batch_times_data_parallel == 0, \ +[default4]:AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (48) +[default0]:Traceback (most recent call last): +[default0]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default0]: main() +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default0]: return f(*args, **kwargs) +[default0]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default0]: pretrain( +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default0]: initialize_megatron(extra_args_provider=extra_args_provider, +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron +[default0]: set_global_variables(extra_args_provider=extra_args_provider, +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables +[default0]: _build_num_microbatches_calculator(args) +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator +[default0]: _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator +[default0]: num_microbatches_calculator = ConstantNumMicroBatches( +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ +[default0]: assert global_batch_size % micro_batch_times_data_parallel == 0, \ +[default0]:AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (48) +[default1]:Traceback (most recent call last): +[default1]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default1]: main() +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default1]: return f(*args, **kwargs) +[default1]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default1]: pretrain( +[default2]:Traceback (most recent call last): +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default1]: initialize_megatron(extra_args_provider=extra_args_provider, +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron +[default1]: set_global_variables(extra_args_provider=extra_args_provider, +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables +[default1]: _build_num_microbatches_calculator(args) +[default2]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default2]: main() +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default2]: return f(*args, **kwargs) +[default2]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default2]: pretrain( +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator +[default1]: _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator +[default1]: num_microbatches_calculator = ConstantNumMicroBatches( +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ +[default1]: assert global_batch_size % micro_batch_times_data_parallel == 0, \ +[default1]:AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (48) +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default2]: initialize_megatron(extra_args_provider=extra_args_provider, +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron +[default2]: set_global_variables(extra_args_provider=extra_args_provider, +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables +[default2]: _build_num_microbatches_calculator(args) +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator +[default2]: _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator +[default2]: num_microbatches_calculator = ConstantNumMicroBatches( +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ +[default2]: assert global_batch_size % micro_batch_times_data_parallel == 0, \ +[default2]:AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (48) +[default6]:Traceback (most recent call last): +[default6]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default5]:Traceback (most recent call last): +[default5]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default6]: main() +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default5]: main() +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default5]: return f(*args, **kwargs) +[default6]: return f(*args, **kwargs) +[default6]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default6]: pretrain( +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default6]: initialize_megatron(extra_args_provider=extra_args_provider, +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron +[default5]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default5]: pretrain( +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default5]: initialize_megatron(extra_args_provider=extra_args_provider, +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron +[default6]: set_global_variables(extra_args_provider=extra_args_provider, +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables +[default6]: _build_num_microbatches_calculator(args) +[default5]: set_global_variables(extra_args_provider=extra_args_provider, +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables +[default5]: _build_num_microbatches_calculator(args) +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator +[default6]: _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator +[default5]: _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator +[default5]: num_microbatches_calculator = ConstantNumMicroBatches( +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator +[default6]: num_microbatches_calculator = ConstantNumMicroBatches( +[default5]: assert global_batch_size % micro_batch_times_data_parallel == 0, \ +[default5]:AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (48) +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ +[default6]: assert global_batch_size % micro_batch_times_data_parallel == 0, \ +[default6]:AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (48) +[default7]:Traceback (most recent call last): +[default7]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default7]: main() +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default7]: return f(*args, **kwargs) +[default7]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default7]: pretrain( +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default7]: initialize_megatron(extra_args_provider=extra_args_provider, +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron +[default7]: set_global_variables(extra_args_provider=extra_args_provider, +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables +[default7]: _build_num_microbatches_calculator(args) +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator +[default7]: _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator +[default7]: num_microbatches_calculator = ConstantNumMicroBatches( +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ +[default7]: assert global_batch_size % micro_batch_times_data_parallel == 0, \ +[default7]:AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (48) +[default2]:Traceback (most recent call last): +[default2]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default2]: main() +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default2]: return f(*args, **kwargs) +[default2]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default2]: pretrain( +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default2]: initialize_megatron(extra_args_provider=extra_args_provider, +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron +[default2]: set_global_variables(extra_args_provider=extra_args_provider, +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables +[default2]: _build_num_microbatches_calculator(args) +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator +[default2]: _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator +[default2]: num_microbatches_calculator = ConstantNumMicroBatches( +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ +[default2]: assert global_batch_size % micro_batch_times_data_parallel == 0, \ +[default2]:AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (48) +[default1]:Traceback (most recent call last): +[default1]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default3]:Traceback (most recent call last): +[default3]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default3]: main() +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default3]: return f(*args, **kwargs) +[default3]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default3]: pretrain( +[default1]: main() +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default1]: return f(*args, **kwargs) +[default1]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default3]: initialize_megatron(extra_args_provider=extra_args_provider, +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron +[default1]: pretrain( +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default1]: initialize_megatron(extra_args_provider=extra_args_provider, +[default3]: set_global_variables(extra_args_provider=extra_args_provider, +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables +[default3]: _build_num_microbatches_calculator(args) +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator +[default3]: _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron +[default1]: set_global_variables(extra_args_provider=extra_args_provider, +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables +[default3]: num_microbatches_calculator = ConstantNumMicroBatches( +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ +[default3]: assert global_batch_size % micro_batch_times_data_parallel == 0, \ +[default3]:AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (48) +[default1]: _build_num_microbatches_calculator(args) +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator +[default1]: _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator +[default1]: num_microbatches_calculator = ConstantNumMicroBatches( +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ +[default1]: assert global_batch_size % micro_batch_times_data_parallel == 0, \ +[default1]:AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (48) +[default0]:Traceback (most recent call last): +[default0]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default0]: main() +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default0]: return f(*args, **kwargs) +[default0]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default0]: pretrain( +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default0]: initialize_megatron(extra_args_provider=extra_args_provider, +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron +[default0]: set_global_variables(extra_args_provider=extra_args_provider, +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables +[default0]: _build_num_microbatches_calculator(args) +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator +[default0]: _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator +[default0]: num_microbatches_calculator = ConstantNumMicroBatches( +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ +[default0]: assert global_batch_size % micro_batch_times_data_parallel == 0, \ +[default0]:AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (48) +[default4]:Traceback (most recent call last): +[default4]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default4]: main() +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default4]: return f(*args, **kwargs) +[default4]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default4]: pretrain( +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default4]: initialize_megatron(extra_args_provider=extra_args_provider, +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron +[default4]: set_global_variables(extra_args_provider=extra_args_provider, +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables +[default4]: _build_num_microbatches_calculator(args) +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator +[default4]: _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator +[default4]: num_microbatches_calculator = ConstantNumMicroBatches( +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ +[default4]: assert global_batch_size % micro_batch_times_data_parallel == 0, \ +[default4]:AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (48) +[default0]:using world size: 96, data-parallel-size: 48, tensor-model-parallel size: 1, pipeline-model-parallel size: 2 +[default0]:using torch.float16 for parameters ... +[default0]:------------------------ arguments ------------------------ +[default0]: abort_on_unmet_fused_kernel_constraints ......... True +[default0]: accumulate_allreduce_grads_in_fp32 .............. False +[default0]: adam_beta1 ...................................... 0.9 +[default0]: adam_beta2 ...................................... 0.95 +[default0]: adam_eps ........................................ 1e-08 +[default0]: adlr_autoresume ................................. False +[default0]: adlr_autoresume_interval ........................ 1000 +[default0]: apply_query_key_layer_scaling ................... True +[default0]: apply_residual_connection_post_layernorm ........ False +[default0]: attention_dropout ............................... 0.1 +[default0]: attention_softmax_in_fp32 ....................... False +[default0]: bert_binary_head ................................ True +[default0]: bert_load ....................................... None +[default0]: bf16 ............................................ False +[default0]: bias_dropout_fusion ............................. True +[default0]: bias_gelu_fusion ................................ True +[default0]: biencoder_projection_dim ........................ 0 +[default0]: biencoder_shared_query_context_model ............ False +[default0]: block_data_path ................................. None +[default0]: checkpoint_activations .......................... True +[default0]: checkpoint_in_cpu ............................... False +[default0]: checkpoint_num_layers ........................... 1 +[default0]: clip_grad ....................................... 1.0 +[default0]: codecarbon_dir .................................. None +[default0]: consumed_train_samples .......................... 0 +[default0]: consumed_train_tokens ........................... 0 +[default0]: consumed_valid_samples .......................... 0 +[default0]: contigious_checkpointing ........................ False +[default0]: cpu_optimizer ................................... False +[default0]: cpu_torch_adam .................................. False +[default0]: curriculum_learning ............................. False +[default0]: data_impl ....................................... mmap +[default0]: data_parallel_size .............................. 48 +[default0]: data_path ....................................... None +[default0]: dataloader_type ................................. single +[default0]: DDP_impl ........................................ local +[default0]: decoder_seq_length .............................. None +[default0]: deepscale ....................................... False +[default0]: deepscale_config ................................ None +[default0]: deepspeed ....................................... True +[default0]: deepspeed_activation_checkpointing .............. True +[default0]: deepspeed_config ................................ ./ds_config.2075397.json +[default0]: deepspeed_mpi ................................... False +[default0]: distribute_checkpointed_activations ............. False +[default0]: distributed_backend ............................. nccl +[default0]: embed_layernorm ................................. True +[default0]: embedding_path .................................. None +[default0]: encoder_seq_length .............................. 2048 +[default0]: eod_mask_loss ................................... False +[default0]: eval_interval ................................... 125 +[default0]: eval_iters ...................................... 10 +[default0]: eval_only ....................................... None +[default0]: evidence_data_path .............................. None +[default0]: exit_duration_in_mins ........................... 5990 +[default0]: exit_interval ................................... None +[default0]: ffn_hidden_size ................................. 6144 +[default0]: finetune ........................................ False +[default0]: fp16 ............................................ True +[default0]: fp16_lm_cross_entropy ........................... False +[default0]: fp32_residual_connection ........................ False +[default0]: gigaflos_no_embeds .............................. 0 +[default0]: global_batch_size ............................... 2048 +[default0]: glu_activation .................................. None +[default0]: hidden_dropout .................................. 0.1 +[default0]: hidden_size ..................................... 1536 +[default0]: hysteresis ...................................... 2 +[default0]: ict_head_size ................................... None +[default0]: ict_load ........................................ None +[default0]: img_dim ......................................... 224 +[default0]: indexer_batch_size .............................. 128 +[default0]: indexer_log_interval ............................ 1000 +[default0]: inference ....................................... False +[default0]: init_method_std ................................. 0.0048 +[default0]: init_method_xavier_uniform ...................... False +[default0]: initial_loss_scale .............................. 4294967296 +[default0]: kill_switch_path ................................ /gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/kill-switch-tr13b-1B3-mtf +[default0]: kv_channels ..................................... 96 +[default0]: layernorm_epsilon ............................... 1e-05 +[default0]: lazy_mpu_init ................................... None +[default0]: load ............................................ /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq +[default0]: local_rank ...................................... None +[default0]: log_batch_size_to_tensorboard ................... True +[default0]: log_interval .................................... 1 +[default0]: log_learning_rate_to_tensorboard ................ True +[default0]: log_level ....................................... None +[default0]: log_level_replica ............................... None +[default0]: log_loss_scale_to_tensorboard ................... True +[default0]: log_num_zeros_in_grad ........................... False +[default0]: log_params_norm ................................. False +[default0]: log_path ........................................ None +[default0]: log_timers_to_tensorboard ....................... True +[default0]: log_validation_ppl_to_tensorboard ............... True +[default0]: loss_on_targets_only ............................ False +[default0]: loss_scale ...................................... None +[default0]: loss_scale_window ............................... 1000 +[default0]: lr .............................................. 2e-05 +[default0]: lr_decay_iters .................................. None +[default0]: lr_decay_samples ................................ None +[default0]: lr_decay_style .................................. constant +[default0]: lr_decay_tokens ................................. None +[default0]: lr_warmup_fraction .............................. None +[default0]: lr_warmup_iters ................................. 0 +[default0]: lr_warmup_samples ............................... 0 +[default0]: make_vocab_size_divisible_by .................... 128 +[default0]: mask_prob ....................................... 0.15 +[default0]: masked_softmax_fusion ........................... True +[default0]: max_position_embeddings ......................... 2048 +[default0]: mean_noise_span_length .......................... None +[default0]: memory_centric_tiled_linear ..................... False +[default0]: merge_file ...................................... None +[default0]: micro_batch_size ................................ 1 +[default0]: min_loss_scale .................................. 1.0 +[default0]: min_lr .......................................... 0.0 +[default0]: mmap_warmup ..................................... False +[default0]: no_load_optim ................................... True +[default0]: no_load_rng ..................................... None +[default0]: no_save_optim ................................... None +[default0]: no_save_rng ..................................... None +[default0]: noise_density ................................... None +[default0]: norm_target_loss ................................ True +[default0]: num_attention_heads ............................. 16 +[default0]: num_channels .................................... 3 +[default0]: num_classes ..................................... 1000 +[default0]: num_layers ...................................... 24 +[default0]: num_layers_per_virtual_pipeline_stage ........... None +[default0]: num_workers ..................................... 2 +[default0]: onnx_safe ....................................... None +[default0]: openai_gelu ..................................... False +[default0]: optimizer ....................................... adam +[default0]: override_lr_scheduler ........................... False +[default0]: pad_vocab_size_to ............................... 250880 +[default0]: params_dtype .................................... torch.float16 +[default0]: partition_activations ........................... False +[default0]: patch_dim ....................................... 16 +[default0]: pipeline_model_parallel_size .................... 2 +[default0]: position_embedding_type ......................... PositionEmbeddingType.alibi +[default0]: pp_partition_method ............................. type:transformer|embedding +[default0]: prefixlm ........................................ False +[default0]: profile_backward ................................ False +[default0]: query_in_block_prob ............................. 0.1 +[default0]: rampup_batch_size ............................... None +[default0]: rank ............................................ 0 +[default0]: remote_device ................................... none +[default0]: reset_attention_mask ............................ False +[default0]: reset_position_ids .............................. False +[default0]: reset_progress .................................. True +[default0]: retriever_report_topk_accuracies ................ [] +[default0]: retriever_score_scaling ......................... False +[default0]: retriever_seq_length ............................ 256 +[default0]: reweight_loss_based_on_position_frequency ....... False +[default0]: sample_rate ..................................... 1.0 +[default0]: save ............................................ /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq +[default0]: save_interval ................................... 2 +[default0]: scatter_gather_tensors_in_pipeline .............. True +[default0]: scattered_embeddings ............................ False +[default0]: seed ............................................ 42 +[default0]: seq_length ...................................... 2048 +[default0]: sgd_momentum .................................... 0.9 +[default0]: short_seq_prob .................................. 0.1 +[default0]: skip_train_iteration_range ...................... None +[default0]: split ........................................... None +[default0]: split_transformers .............................. False +[default0]: sync_tp_duplicated_parameters ................... False +[default0]: synchronize_each_layer .......................... False +[default0]: tensor_model_parallel_size ...................... 1 +[default0]: tensorboard_dir ................................. /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/tr13b-1B3-ml-logs/tensorboard/xp3capmixnewcodelonglossseq +[default0]: tensorboard_log_interval ........................ 1 +[default0]: tensorboard_queue_size .......................... 5 +[default0]: test_weighted_split_paths ....................... None +[default0]: test_weighted_split_paths_path .................. None +[default0]: tile_factor ..................................... 1 +[default0]: titles_data_path ................................ None +[default0]: tokenizer_name_or_path .......................... bigscience/tokenizer +[default0]: tokenizer_type .................................. PretrainedFromHF +[default0]: train_iters ..................................... None +[default0]: train_samples ................................... 6348800 +[default0]: train_tokens .................................... None +[default0]: train_weighted_split_names ...................... ['train'] +[default0]: train_weighted_split_paths ...................... [['/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_en', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_es', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_pt', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_code', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_fr', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ar', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_id', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_zh', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_hi', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_vi', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ur', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_te', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ta', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_bn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_mr', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_sw', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_gu', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_pa', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ne', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_yo', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ig', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ny', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_zu', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_xh', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_sn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ts', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_rw', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_lg', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_tn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_nso', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_rn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ml', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_kn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_or', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_as', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ln', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_wo', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_tum', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ki', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_st', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_fon', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ca', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_eu', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ak', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_bm', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_tw']] +[default0]: train_weighted_split_paths_path ................. None +[default0]: train_weighted_split_splits ..................... [['0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950']] +[default0]: train_weighted_split_weights .................... [['0.3924620202', '0.0797519865', '0.0645613968', '0.0592991658', '0.0584218969', '0.0492644683', '0.0485168956', '0.048344327', '0.0462777165', '0.0326663657', '0.0202046859', '0.0140392334', '0.0097374884', '0.0087708344', '0.0070173416', '0.0059077793', '0.0059055884', '0.0055112422', '0.0041465344', '0.0037157869', '0.0034255885', '0.0028662571', '0.0027776135', '0.0026823974', '0.0026594781', '0.0026358847', '0.0026264474', '0.0024850557', '0.0024808426', '0.0024194999', '0.0020327371', '0.0018436532', '0.0017427072', '0.0017007448', '0.0016458059', '0.0013303289', '0.0012908943', '0.0012221364', '0.001211688', '0.0012015765', '0.0011909595', '0.0011650068', '0.001138717', '0.0011385485', '0.0011275944', '0.0011195053']] +[default0]: universal_checkpoint ............................ False +[default0]: use_bnb_optimizer ............................... False +[default0]: use_checkpoint_lr_scheduler ..................... False +[default0]: use_contiguous_buffers_in_ddp ................... False +[default0]: use_cpu_initialization .......................... None +[default0]: use_one_sent_docs ............................... False +[default0]: use_pin_memory .................................. False +[default0]: valid_num_workers ............................... 2 +[default0]: valid_weighted_split_names ...................... ['validation_pretraining', 'validation'] +[default0]: valid_weighted_split_paths ...................... [['/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/ar/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_ar_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/ca/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_ca_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/code/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_code_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/en/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_en_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/es/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_es_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/eu/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_eu_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/fr/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_fr_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/id/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_id_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-as/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-as_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-bn/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-bn_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-gu/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-gu_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-hi/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-hi_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-kn/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-kn_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-ml/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-ml_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-mr/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-mr_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-ne/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-ne_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-or/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-or_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-pa/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-pa_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-ta/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-ta_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-te/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-te_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-ur/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-ur_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/nigercongo-all/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_nigercongo-all_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/oscar-en/meg_ds_bigscience_tokenizer_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/oscar-zh/meg_ds_bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/pt/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_pt_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/vi/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_vi_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/zhs/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_zhs_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/zht/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_zht_text_document'], ['/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_en', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_es', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_pt', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_code', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_fr', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ar', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_id', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_zh', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_hi', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_vi', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ur', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_te', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ta', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_bn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_mr', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_sw', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_gu', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_pa', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ne', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_yo', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ig', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ny', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_zu', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_xh', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_sn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ts', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_rw', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_lg', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_tn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_nso', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_rn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ml', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_kn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_or', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_as', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ln', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_wo', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_tum', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ki', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_st', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_fon', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ca', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_eu', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ak', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_bm', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_tw']] +[default0]: valid_weighted_split_paths_path ................. None +[default0]: valid_weighted_split_splits ..................... [['0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0'], ['0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1']] +[default0]: valid_weighted_split_weights .................... [['0.0330676168743166', '0.011242051312222764', '0.13027200903379185', '0.22171164529099704', '0.10667815627928671', '0.0015595123898173287', '0.13054018439603915', '0.01091803753667153', '0.00011021422347108609', '0.005492381453597748', '0.0004021215011318779', '0.007470068593492175', '0.0006190467776576425', '0.0010335296343329384', '0.0005012010684646179', '0.0006672772956128299', '0.00035928138344705506', '0.0005084433130291778', '0.0021137328219915496', '0.0009129946225980253', '0.0012454301613725426', '0.00031588689199263235', '0.08137213783015229', '0.055293935695898196', '0.04954150576361177', '0.02461641286531197', '0.12091748245519074', '0.0005177025345001541'], ['0.3924620202', '0.0797519865', '0.0645613968', '0.0592991658', '0.0584218969', '0.0492644683', '0.0485168956', '0.048344327', '0.0462777165', '0.0326663657', '0.0202046859', '0.0140392334', '0.0097374884', '0.0087708344', '0.0070173416', '0.0059077793', '0.0059055884', '0.0055112422', '0.0041465344', '0.0037157869', '0.0034255885', '0.0028662571', '0.0027776135', '0.0026823974', '0.0026594781', '0.0026358847', '0.0026264474', '0.0024850557', '0.0024808426', '0.0024194999', '0.0020327371', '0.0018436532', '0.0017427072', '0.0017007448', '0.0016458059', '0.0013303289', '0.0012908943', '0.0012221364', '0.001211688', '0.0012015765', '0.0011909595', '0.0011650068', '0.001138717', '0.0011385485', '0.0011275944', '0.0011195053']] +[default0]: virtual_pipeline_model_parallel_size ............ None +[default0]: vocab_extra_ids ................................. 0 +[default0]: vocab_file ...................................... None +[default0]: weight_decay .................................... 0.0001 +[default0]: world_size ...................................... 96 +[default0]: zero_allgather_bucket_size ...................... 0.0 +[default0]: zero_contigious_gradients ....................... False +[default0]: zero_reduce_bucket_size ......................... 0.0 +[default0]: zero_reduce_scatter ............................. False +[default0]: zero_stage ...................................... 1 +[default0]:-------------------- end of arguments --------------------- +[default0]:Traceback (most recent call last): +[default0]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default0]: main() +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default0]: return f(*args, **kwargs) +[default0]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default0]: pretrain( +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default0]: initialize_megatron(extra_args_provider=extra_args_provider, +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron +[default0]: set_global_variables(extra_args_provider=extra_args_provider, +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables +[default0]: _build_num_microbatches_calculator(args) +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator +[default0]: _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator +[default0]: num_microbatches_calculator = ConstantNumMicroBatches( +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ +[default0]: assert global_batch_size % micro_batch_times_data_parallel == 0, \ +[default0]:AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (48) +[default1]:Traceback (most recent call last): +[default1]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default1]: main() +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default1]: return f(*args, **kwargs) +[default1]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default1]: pretrain( +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default1]: initialize_megatron(extra_args_provider=extra_args_provider, +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron +[default1]: set_global_variables(extra_args_provider=extra_args_provider, +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables +[default1]: _build_num_microbatches_calculator(args) +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator +[default1]: _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator +[default1]: num_microbatches_calculator = ConstantNumMicroBatches( +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ +[default1]: assert global_batch_size % micro_batch_times_data_parallel == 0, \ +[default1]:AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (48) +[default3]:Traceback (most recent call last): +[default3]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default3]: main() +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default3]: return f(*args, **kwargs) +[default3]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default3]: pretrain( +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default3]: initialize_megatron(extra_args_provider=extra_args_provider, +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron +[default3]: set_global_variables(extra_args_provider=extra_args_provider, +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables +[default3]: _build_num_microbatches_calculator(args) +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator +[default3]: _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator +[default3]: num_microbatches_calculator = ConstantNumMicroBatches( +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ +[default3]: assert global_batch_size % micro_batch_times_data_parallel == 0, \ +[default3]:AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (48) +[default4]:Traceback (most recent call last): +[default4]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default4]: main() +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default4]: return f(*args, **kwargs) +[default4]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default4]: pretrain( +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default4]: initialize_megatron(extra_args_provider=extra_args_provider, +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron +[default4]: set_global_variables(extra_args_provider=extra_args_provider, +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables +[default4]: _build_num_microbatches_calculator(args) +[default0]:Traceback (most recent call last): +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator +[default4]: _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator +[default4]: num_microbatches_calculator = ConstantNumMicroBatches( +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ +[default4]: assert global_batch_size % micro_batch_times_data_parallel == 0, \ +[default4]:AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (48) +[default2]:Traceback (most recent call last): +[default2]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default2]: main() +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default2]: return f(*args, **kwargs) +[default2]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default2]: pretrain( +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default2]: initialize_megatron(extra_args_provider=extra_args_provider, +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron +[default0]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default0]: main() +[default1]:Traceback (most recent call last): +[default1]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default1]: main() +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default0]: return f(*args, **kwargs) +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default1]: return f(*args, **kwargs) +[default0]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default0]: pretrain( +[default2]: set_global_variables(extra_args_provider=extra_args_provider, +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables +[default2]: _build_num_microbatches_calculator(args) +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator +[default2]: _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator +[default2]: num_microbatches_calculator = ConstantNumMicroBatches( +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default0]: initialize_megatron(extra_args_provider=extra_args_provider, +[default1]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default1]: pretrain( +[default2]: assert global_batch_size % micro_batch_times_data_parallel == 0, \ +[default2]:AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (48) +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron +[default0]: set_global_variables(extra_args_provider=extra_args_provider, +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default1]: initialize_megatron(extra_args_provider=extra_args_provider, +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron +[default0]: _build_num_microbatches_calculator(args) +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator +[default0]: _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +[default1]: set_global_variables(extra_args_provider=extra_args_provider, +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator +[default0]: num_microbatches_calculator = ConstantNumMicroBatches( +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ +[default0]: assert global_batch_size % micro_batch_times_data_parallel == 0, \ +[default1]: _build_num_microbatches_calculator(args) +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator +[default1]: _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator +[default0]:AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (48) +[default1]: num_microbatches_calculator = ConstantNumMicroBatches( +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ +[default1]: assert global_batch_size % micro_batch_times_data_parallel == 0, \ +[default1]:AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (48) +[default4]:Traceback (most recent call last): +[default4]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default4]: main() +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default4]: return f(*args, **kwargs) +[default4]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default4]: pretrain( +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default4]: initialize_megatron(extra_args_provider=extra_args_provider, +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron +[default4]: set_global_variables(extra_args_provider=extra_args_provider, +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables +[default4]: _build_num_microbatches_calculator(args) +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator +[default4]: _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator +[default4]: num_microbatches_calculator = ConstantNumMicroBatches( +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ +[default4]: assert global_batch_size % micro_batch_times_data_parallel == 0, \ +[default4]:AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (48) +[default3]:Traceback (most recent call last): +[default3]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default3]: main() +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default3]: return f(*args, **kwargs) +[default3]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default2]:Traceback (most recent call last): +[default2]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default2]: main() +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default2]: return f(*args, **kwargs) +[default2]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default3]: pretrain( +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default3]: initialize_megatron(extra_args_provider=extra_args_provider, +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron +[default3]: set_global_variables(extra_args_provider=extra_args_provider, +[default2]: pretrain( +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default2]: initialize_megatron(extra_args_provider=extra_args_provider, +[default3]: _build_num_microbatches_calculator(args) +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator +[default3]: _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator +[default3]: num_microbatches_calculator = ConstantNumMicroBatches( +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ +[default3]: assert global_batch_size % micro_batch_times_data_parallel == 0, \ +[default3]:AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (48) +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron +[default2]: set_global_variables(extra_args_provider=extra_args_provider, +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables +[default2]: _build_num_microbatches_calculator(args) +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator +[default2]: _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator +[default2]: num_microbatches_calculator = ConstantNumMicroBatches( +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ +[default2]: assert global_batch_size % micro_batch_times_data_parallel == 0, \ +[default2]:AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (48) +[default0]:Traceback (most recent call last): +[default0]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default0]: main() +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default0]: return f(*args, **kwargs) +[default0]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default0]: pretrain( +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default0]: initialize_megatron(extra_args_provider=extra_args_provider, +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron +[default0]: set_global_variables(extra_args_provider=extra_args_provider, +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables +[default0]: _build_num_microbatches_calculator(args) +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator +[default0]: _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator +[default0]: num_microbatches_calculator = ConstantNumMicroBatches( +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ +[default0]: assert global_batch_size % micro_batch_times_data_parallel == 0, \ +[default0]:AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (48) +[default2]:Traceback (most recent call last): +[default2]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default2]: main() +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default2]: return f(*args, **kwargs) +[default2]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default2]: pretrain( +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default2]: initialize_megatron(extra_args_provider=extra_args_provider, +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron +[default2]: set_global_variables(extra_args_provider=extra_args_provider, +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables +[default2]: _build_num_microbatches_calculator(args) +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator +[default2]: _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator +[default2]: num_microbatches_calculator = ConstantNumMicroBatches( +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ +[default2]: assert global_batch_size % micro_batch_times_data_parallel == 0, \ +[default2]:AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (48) +[default3]:Traceback (most recent call last): +[default3]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default3]: main() +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default3]: return f(*args, **kwargs) +[default3]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default3]: pretrain( +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default3]: initialize_megatron(extra_args_provider=extra_args_provider, +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron +[default3]: set_global_variables(extra_args_provider=extra_args_provider, +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables +[default3]: _build_num_microbatches_calculator(args) +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator +[default3]: _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator +[default3]: num_microbatches_calculator = ConstantNumMicroBatches( +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ +[default3]: assert global_batch_size % micro_batch_times_data_parallel == 0, \ +[default3]:AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (48) +[default1]:Traceback (most recent call last): +[default4]:Traceback (most recent call last): +[default4]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default4]: main() +[default1]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default1]: main() +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default1]: return f(*args, **kwargs) +[default1]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default1]: pretrain( +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default1]: initialize_megatron(extra_args_provider=extra_args_provider, +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron +[default1]: set_global_variables(extra_args_provider=extra_args_provider, +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default4]: return f(*args, **kwargs) +[default4]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables +[default1]: _build_num_microbatches_calculator(args) +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator +[default1]: _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator +[default1]: num_microbatches_calculator = ConstantNumMicroBatches( +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ +[default4]: pretrain( +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default1]: assert global_batch_size % micro_batch_times_data_parallel == 0, \ +[default1]:AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (48) +[default4]: initialize_megatron(extra_args_provider=extra_args_provider, +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron +[default4]: set_global_variables(extra_args_provider=extra_args_provider, +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables +[default4]: _build_num_microbatches_calculator(args) +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator +[default4]: _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator +[default4]: num_microbatches_calculator = ConstantNumMicroBatches( +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ +[default4]: assert global_batch_size % micro_batch_times_data_parallel == 0, \ +[default4]:AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (48) +[default0]:Traceback (most recent call last): +[default0]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default0]: main() +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default0]: return f(*args, **kwargs) +[default0]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default0]: pretrain( +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default0]: initialize_megatron(extra_args_provider=extra_args_provider, +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron +[default0]: set_global_variables(extra_args_provider=extra_args_provider, +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables +[default0]: _build_num_microbatches_calculator(args) +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator +[default0]: _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator +[default0]: num_microbatches_calculator = ConstantNumMicroBatches( +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ +[default0]: assert global_batch_size % micro_batch_times_data_parallel == 0, \ +[default0]:AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (48) +[default1]:Traceback (most recent call last): +[default1]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default1]: main() +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default1]: return f(*args, **kwargs) +[default1]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default1]: pretrain( +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default1]: initialize_megatron(extra_args_provider=extra_args_provider, +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron +[default1]: set_global_variables(extra_args_provider=extra_args_provider, +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables +[default1]: _build_num_microbatches_calculator(args) +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator +[default1]: _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator +[default1]: num_microbatches_calculator = ConstantNumMicroBatches( +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ +[default1]: assert global_batch_size % micro_batch_times_data_parallel == 0, \ +[default1]:AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (48) +[default2]:Traceback (most recent call last): +[default2]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default2]: main() +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default2]: return f(*args, **kwargs) +[default2]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default2]: pretrain( +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default2]: initialize_megatron(extra_args_provider=extra_args_provider, +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron +[default2]: set_global_variables(extra_args_provider=extra_args_provider, +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables +[default2]: _build_num_microbatches_calculator(args) +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator +[default2]: _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator +[default2]: num_microbatches_calculator = ConstantNumMicroBatches( +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ +[default2]: assert global_batch_size % micro_batch_times_data_parallel == 0, \ +[default2]:AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (48) +[default3]:Traceback (most recent call last): +[default3]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default3]: main() +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default3]: return f(*args, **kwargs) +[default3]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default3]: pretrain( +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default3]: initialize_megatron(extra_args_provider=extra_args_provider, +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron +[default3]: set_global_variables(extra_args_provider=extra_args_provider, +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables +[default3]: _build_num_microbatches_calculator(args) +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator +[default3]: _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator +[default3]: num_microbatches_calculator = ConstantNumMicroBatches( +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ +[default3]: assert global_batch_size % micro_batch_times_data_parallel == 0, \ +[default3]:AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (48) +[default1]:Traceback (most recent call last): +[default1]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default1]: main() +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default1]: return f(*args, **kwargs) +[default1]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default1]: pretrain( +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default1]: initialize_megatron(extra_args_provider=extra_args_provider, +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron +[default1]: set_global_variables(extra_args_provider=extra_args_provider, +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables +[default1]: _build_num_microbatches_calculator(args) +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator +[default1]: _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator +[default1]: num_microbatches_calculator = ConstantNumMicroBatches( +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ +[default1]: assert global_batch_size % micro_batch_times_data_parallel == 0, \ +[default1]:AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (48) +[default2]:Traceback (most recent call last): +[default0]:Traceback (most recent call last): +[default0]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default0]: main() +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default2]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default2]: main() +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default3]:Traceback (most recent call last): +[default3]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default3]: main() +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default3]: return f(*args, **kwargs) +[default3]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default2]: return f(*args, **kwargs) +[default2]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default3]: pretrain( +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default3]: initialize_megatron(extra_args_provider=extra_args_provider, +[default2]: pretrain( +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default2]: initialize_megatron(extra_args_provider=extra_args_provider, +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron +[default2]: set_global_variables(extra_args_provider=extra_args_provider, +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron +[default3]: set_global_variables(extra_args_provider=extra_args_provider, +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables +[default0]: return f(*args, **kwargs) +[default0]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables +[default2]: _build_num_microbatches_calculator(args) +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator +[default3]: _build_num_microbatches_calculator(args) +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator +[default3]: _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +[default2]: _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator +[default2]: num_microbatches_calculator = ConstantNumMicroBatches( +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ +[default2]: assert global_batch_size % micro_batch_times_data_parallel == 0, \ +[default0]: pretrain( +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default0]: initialize_megatron(extra_args_provider=extra_args_provider, +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron +[default2]:AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (48) +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator +[default3]: num_microbatches_calculator = ConstantNumMicroBatches( +[default0]: set_global_variables(extra_args_provider=extra_args_provider, +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ +[default3]: assert global_batch_size % micro_batch_times_data_parallel == 0, \ +[default0]: _build_num_microbatches_calculator(args) +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator +[default0]: _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +[default3]:AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (48) +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator +[default0]: num_microbatches_calculator = ConstantNumMicroBatches( +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ +[default0]: assert global_batch_size % micro_batch_times_data_parallel == 0, \ +[default0]:AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (48) +[default7]:Traceback (most recent call last): +[default6]:Traceback (most recent call last): +[default7]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default4]:Traceback (most recent call last): +[default4]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default4]: main() +[default7]: main() +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default7]: return f(*args, **kwargs) +[default7]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default7]: pretrain( +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default7]: initialize_megatron(extra_args_provider=extra_args_provider, +[default6]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default5]:Traceback (most recent call last): +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron +[default7]: set_global_variables(extra_args_provider=extra_args_provider, +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables +[default5]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default7]: _build_num_microbatches_calculator(args) +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator +[default7]: _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator +[default6]: main() +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default6]: return f(*args, **kwargs) +[default6]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default5]: main() +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default5]: return f(*args, **kwargs) +[default5]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default5]: pretrain( +[default7]: num_microbatches_calculator = ConstantNumMicroBatches( +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default4]: return f(*args, **kwargs) +[default4]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default6]: pretrain( +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default6]: initialize_megatron(extra_args_provider=extra_args_provider, +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron +[default6]: set_global_variables(extra_args_provider=extra_args_provider, +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default5]: initialize_megatron(extra_args_provider=extra_args_provider, +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ +[default7]: assert global_batch_size % micro_batch_times_data_parallel == 0, \ +[default7]:AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (48) +[default6]: _build_num_microbatches_calculator(args) +[default5]: set_global_variables(extra_args_provider=extra_args_provider, +[default4]:Traceback (most recent call last): +[default4]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default4]: main() +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default4]: return f(*args, **kwargs) +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables +[default5]: _build_num_microbatches_calculator(args) +[default4]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default4]: pretrain( +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default4]: initialize_megatron(extra_args_provider=extra_args_provider, +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron +[default4]: set_global_variables(extra_args_provider=extra_args_provider, +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator +[default6]: _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator +[default6]: num_microbatches_calculator = ConstantNumMicroBatches( +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables +[default4]: _build_num_microbatches_calculator(args) +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator +[default4]: _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator +[default4]: num_microbatches_calculator = ConstantNumMicroBatches( +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ +[default4]: assert global_batch_size % micro_batch_times_data_parallel == 0, \ +[default5]: _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator +[default5]: num_microbatches_calculator = ConstantNumMicroBatches( +[default4]:AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (48) +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ +[default4]: pretrain( +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default5]: assert global_batch_size % micro_batch_times_data_parallel == 0, \ +[default5]:AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (48) +[default4]: initialize_megatron(extra_args_provider=extra_args_provider, +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron +[default4]: set_global_variables(extra_args_provider=extra_args_provider, +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ +[default6]: assert global_batch_size % micro_batch_times_data_parallel == 0, \ +[default6]:AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (48) +[default4]: _build_num_microbatches_calculator(args) +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator +[default4]: _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator +[default4]: num_microbatches_calculator = ConstantNumMicroBatches( +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ +[default4]: assert global_batch_size % micro_batch_times_data_parallel == 0, \ +[default4]:AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (48) +[default7]:Traceback (most recent call last): +[default7]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default7]: main() +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default7]: return f(*args, **kwargs) +[default7]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default7]: pretrain( +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default7]: initialize_megatron(extra_args_provider=extra_args_provider, +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron +[default7]: set_global_variables(extra_args_provider=extra_args_provider, +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables +[default7]: _build_num_microbatches_calculator(args) +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator +[default7]: _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator +[default7]: num_microbatches_calculator = ConstantNumMicroBatches( +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ +[default7]: assert global_batch_size % micro_batch_times_data_parallel == 0, \ +[default7]:AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (48) +[default7]:Traceback (most recent call last): +[default7]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default7]: main() +[default5]:Traceback (most recent call last): +[default5]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default5]: main() +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default5]: return f(*args, **kwargs) +[default5]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default5]: pretrain( +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default6]:Traceback (most recent call last): +[default6]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default6]: main() +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default6]: return f(*args, **kwargs) +[default6]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default5]: initialize_megatron(extra_args_provider=extra_args_provider, +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron +[default5]: set_global_variables(extra_args_provider=extra_args_provider, +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables +[default5]: _build_num_microbatches_calculator(args) +[default7]: return f(*args, **kwargs) +[default7]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator +[default5]: _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator +[default7]: pretrain( +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default6]: pretrain( +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default6]: initialize_megatron(extra_args_provider=extra_args_provider, +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron +[default5]: num_microbatches_calculator = ConstantNumMicroBatches( +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ +[default5]: assert global_batch_size % micro_batch_times_data_parallel == 0, \ +[default7]: initialize_megatron(extra_args_provider=extra_args_provider, +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron +[default7]: set_global_variables(extra_args_provider=extra_args_provider, +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables +[default5]:AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (48) +[default6]: set_global_variables(extra_args_provider=extra_args_provider, +[default7]: _build_num_microbatches_calculator(args) +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator +[default7]: _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables +[default6]: _build_num_microbatches_calculator(args) +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator +[default6]: _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +[default7]: num_microbatches_calculator = ConstantNumMicroBatches( +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ +[default7]: assert global_batch_size % micro_batch_times_data_parallel == 0, \ +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator +[default6]: num_microbatches_calculator = ConstantNumMicroBatches( +[default7]:AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (48) +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ +[default6]: assert global_batch_size % micro_batch_times_data_parallel == 0, \ +[default6]:AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (48) +[default7]:Traceback (most recent call last): +[default7]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default7]: main() +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default7]: return f(*args, **kwargs) +[default6]:Traceback (most recent call last): +[default6]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default6]: main() +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default6]: return f(*args, **kwargs) +[default6]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default5]:Traceback (most recent call last): +[default5]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default5]: main() +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default7]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default7]: pretrain( +[default5]: return f(*args, **kwargs) +[default5]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default7]: initialize_megatron(extra_args_provider=extra_args_provider, +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron +[default5]: pretrain( +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default5]: initialize_megatron(extra_args_provider=extra_args_provider, +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron +[default6]: pretrain( +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default5]: set_global_variables(extra_args_provider=extra_args_provider, +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables +[default5]: _build_num_microbatches_calculator(args) +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator +[default7]: set_global_variables(extra_args_provider=extra_args_provider, +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables +[default7]: _build_num_microbatches_calculator(args) +[default5]: _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator +[default6]: initialize_megatron(extra_args_provider=extra_args_provider, +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator +[default6]: set_global_variables(extra_args_provider=extra_args_provider, +[default5]: num_microbatches_calculator = ConstantNumMicroBatches( +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ +[default7]: _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator +[default5]: assert global_batch_size % micro_batch_times_data_parallel == 0, \ +[default5]:AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (48) +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables +[default6]: _build_num_microbatches_calculator(args) +[default7]: num_microbatches_calculator = ConstantNumMicroBatches( +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ +[default7]: assert global_batch_size % micro_batch_times_data_parallel == 0, \ +[default7]:AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (48) +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator +[default6]: _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator +[default6]: num_microbatches_calculator = ConstantNumMicroBatches( +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ +[default6]: assert global_batch_size % micro_batch_times_data_parallel == 0, \ +[default6]:AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (48) +[default6]:Traceback (most recent call last): +[default6]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default6]: main() +[default7]:Traceback (most recent call last): +[default7]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default7]: main() +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default7]: return f(*args, **kwargs) +[default7]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default6]: return f(*args, **kwargs) +[default6]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default6]: pretrain( +[default5]:Traceback (most recent call last): +[default5]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default5]: main() +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default7]: pretrain( +[default5]: return f(*args, **kwargs) +[default5]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default5]: pretrain( +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default5]: initialize_megatron(extra_args_provider=extra_args_provider, +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default6]: initialize_megatron(extra_args_provider=extra_args_provider, +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron +[default6]: set_global_variables(extra_args_provider=extra_args_provider, +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables +[default7]: initialize_megatron(extra_args_provider=extra_args_provider, +[default6]: _build_num_microbatches_calculator(args) +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron +[default7]: set_global_variables(extra_args_provider=extra_args_provider, +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables +[default5]: set_global_variables(extra_args_provider=extra_args_provider, +[default6]: _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables +[default5]: _build_num_microbatches_calculator(args) +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator +[default7]: _build_num_microbatches_calculator(args) +[default6]: num_microbatches_calculator = ConstantNumMicroBatches( +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator +[default7]: _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ +[default7]: num_microbatches_calculator = ConstantNumMicroBatches( +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ +[default5]: _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator +[default6]: assert global_batch_size % micro_batch_times_data_parallel == 0, \ +[default5]: num_microbatches_calculator = ConstantNumMicroBatches( +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ +[default6]:AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (48) +[default5]: assert global_batch_size % micro_batch_times_data_parallel == 0, \ +[default5]:AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (48) +[default7]: assert global_batch_size % micro_batch_times_data_parallel == 0, \ +[default7]:AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (48) +[default5]:Traceback (most recent call last): +[default6]:Traceback (most recent call last): +[default6]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default6]: main() +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default5]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default5]: main() +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default6]: return f(*args, **kwargs) +[default5]: return f(*args, **kwargs) +[default5]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default5]: pretrain( +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default7]:Traceback (most recent call last): +[default7]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default7]: main() +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default5]: initialize_megatron(extra_args_provider=extra_args_provider, +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron +[default6]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default6]: pretrain( +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default6]: initialize_megatron(extra_args_provider=extra_args_provider, +[default7]: return f(*args, **kwargs) +[default7]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default7]: pretrain( +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default7]: initialize_megatron(extra_args_provider=extra_args_provider, +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron +[default6]: set_global_variables(extra_args_provider=extra_args_provider, +[default7]: set_global_variables(extra_args_provider=extra_args_provider, +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables +[default7]: _build_num_microbatches_calculator(args) +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator +[default7]: _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator +[default7]: num_microbatches_calculator = ConstantNumMicroBatches( +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ +[default7]: assert global_batch_size % micro_batch_times_data_parallel == 0, \ +[default7]:AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (48) +[default5]: set_global_variables(extra_args_provider=extra_args_provider, +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables +[default5]: _build_num_microbatches_calculator(args) +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator +[default5]: _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator +[default6]: _build_num_microbatches_calculator(args) +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator +[default6]: _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator +[default5]: num_microbatches_calculator = ConstantNumMicroBatches( +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ +[default6]: num_microbatches_calculator = ConstantNumMicroBatches( +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ +[default5]: assert global_batch_size % micro_batch_times_data_parallel == 0, \ +[default5]:AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (48) +[default6]: assert global_batch_size % micro_batch_times_data_parallel == 0, \ +[default6]:AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (48) +[default0]:Traceback (most recent call last): +[default0]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default0]: main() +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default0]: return f(*args, **kwargs) +[default0]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default0]: pretrain( +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default0]: initialize_megatron(extra_args_provider=extra_args_provider, +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron +[default0]: set_global_variables(extra_args_provider=extra_args_provider, +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables +[default0]: _build_num_microbatches_calculator(args) +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator +[default0]: _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator +[default0]: num_microbatches_calculator = ConstantNumMicroBatches( +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ +[default0]: assert global_batch_size % micro_batch_times_data_parallel == 0, \ +[default0]:AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (48) +[default2]:Traceback (most recent call last): +[default2]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default2]: main() +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default2]: return f(*args, **kwargs) +[default2]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default2]: pretrain( +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default2]: initialize_megatron(extra_args_provider=extra_args_provider, +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron +[default2]: set_global_variables(extra_args_provider=extra_args_provider, +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables +[default2]: _build_num_microbatches_calculator(args) +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator +[default2]: _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator +[default2]: num_microbatches_calculator = ConstantNumMicroBatches( +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ +[default2]: assert global_batch_size % micro_batch_times_data_parallel == 0, \ +[default2]:AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (48) +[default1]:Traceback (most recent call last): +[default1]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default1]: main() +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default1]: return f(*args, **kwargs) +[default1]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default1]: pretrain( +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default1]: initialize_megatron(extra_args_provider=extra_args_provider, +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron +[default1]: set_global_variables(extra_args_provider=extra_args_provider, +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables +[default1]: _build_num_microbatches_calculator(args) +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator +[default1]: _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator +[default1]: num_microbatches_calculator = ConstantNumMicroBatches( +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ +[default1]: assert global_batch_size % micro_batch_times_data_parallel == 0, \ +[default1]:AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (48) +[default4]:Traceback (most recent call last): +[default4]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default3]:Traceback (most recent call last): +[default3]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default3]: main() +[default4]: main() +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default4]: return f(*args, **kwargs) +[default4]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default4]: pretrain( +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default3]: return f(*args, **kwargs) +[default3]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default3]: pretrain( +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default3]: initialize_megatron(extra_args_provider=extra_args_provider, +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default4]: initialize_megatron(extra_args_provider=extra_args_provider, +[default3]: set_global_variables(extra_args_provider=extra_args_provider, +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables +[default3]: _build_num_microbatches_calculator(args) +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron +[default4]: set_global_variables(extra_args_provider=extra_args_provider, +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables +[default4]: _build_num_microbatches_calculator(args) +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator +[default3]: _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator +[default4]: _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator +[default4]: num_microbatches_calculator = ConstantNumMicroBatches( +[default3]: num_microbatches_calculator = ConstantNumMicroBatches( +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ +[default4]: assert global_batch_size % micro_batch_times_data_parallel == 0, \ +[default4]:AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (48) +[default3]: assert global_batch_size % micro_batch_times_data_parallel == 0, \ +[default3]:AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (48) +[default1]:Traceback (most recent call last): +[default3]:Traceback (most recent call last): +[default3]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default3]: main() +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default3]: return f(*args, **kwargs) +[default3]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default3]: pretrain( +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default3]: initialize_megatron(extra_args_provider=extra_args_provider, +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron +[default3]: set_global_variables(extra_args_provider=extra_args_provider, +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables +[default3]: _build_num_microbatches_calculator(args) +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator +[default3]: _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator +[default3]: num_microbatches_calculator = ConstantNumMicroBatches( +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ +[default3]: assert global_batch_size % micro_batch_times_data_parallel == 0, \ +[default3]:AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (48) +[default4]:Traceback (most recent call last): +[default4]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default4]: main() +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default4]: return f(*args, **kwargs) +[default4]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default4]: pretrain( +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default4]: initialize_megatron(extra_args_provider=extra_args_provider, +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron +[default4]: set_global_variables(extra_args_provider=extra_args_provider, +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables +[default4]: _build_num_microbatches_calculator(args) +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator +[default4]: _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator +[default4]: num_microbatches_calculator = ConstantNumMicroBatches( +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ +[default4]: assert global_batch_size % micro_batch_times_data_parallel == 0, \ +[default4]:AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (48) +[default1]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default1]: main() +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default2]:Traceback (most recent call last): +[default2]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default2]: main() +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default2]: return f(*args, **kwargs) +[default2]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default2]: pretrain( +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default2]: initialize_megatron(extra_args_provider=extra_args_provider, +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron +[default2]: set_global_variables(extra_args_provider=extra_args_provider, +[default0]:Traceback (most recent call last): +[default0]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default0]: main() +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default0]: return f(*args, **kwargs) +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables +[default2]: _build_num_microbatches_calculator(args) +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator +[default1]: return f(*args, **kwargs) +[default1]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default2]: _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator +[default2]: num_microbatches_calculator = ConstantNumMicroBatches( +[default0]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default0]: pretrain( +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ +[default2]: assert global_batch_size % micro_batch_times_data_parallel == 0, \ +[default1]: pretrain( +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default0]: initialize_megatron(extra_args_provider=extra_args_provider, +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron +[default2]:AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (48) +[default0]: set_global_variables(extra_args_provider=extra_args_provider, +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables +[default0]: _build_num_microbatches_calculator(args) +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator +[default1]: initialize_megatron(extra_args_provider=extra_args_provider, +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron +[default1]: set_global_variables(extra_args_provider=extra_args_provider, +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables +[default1]: _build_num_microbatches_calculator(args) +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator +[default1]: _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator +[default1]: num_microbatches_calculator = ConstantNumMicroBatches( +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ +[default1]: assert global_batch_size % micro_batch_times_data_parallel == 0, \ +[default1]:AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (48) +[default0]: _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator +[default0]: num_microbatches_calculator = ConstantNumMicroBatches( +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ +[default0]: assert global_batch_size % micro_batch_times_data_parallel == 0, \ +[default0]:AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (48) +[default7]:Traceback (most recent call last): +[default5]:Traceback (most recent call last): +[default5]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default5]: main() +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default5]: return f(*args, **kwargs) +[default5]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default7]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default7]: main() +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default5]: pretrain( +[default7]: return f(*args, **kwargs) +[default7]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default7]: pretrain( +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default7]: initialize_megatron(extra_args_provider=extra_args_provider, +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron +[default7]: set_global_variables(extra_args_provider=extra_args_provider, +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables +[default7]: _build_num_microbatches_calculator(args) +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator +[default5]: initialize_megatron(extra_args_provider=extra_args_provider, +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron +[default5]: set_global_variables(extra_args_provider=extra_args_provider, +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables +[default5]: _build_num_microbatches_calculator(args) +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator +[default7]: _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator +[default5]: _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator +[default5]: num_microbatches_calculator = ConstantNumMicroBatches( +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ +[default7]: num_microbatches_calculator = ConstantNumMicroBatches( +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ +[default7]: assert global_batch_size % micro_batch_times_data_parallel == 0, \ +[default5]: assert global_batch_size % micro_batch_times_data_parallel == 0, \ +[default7]:AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (48) +[default5]:AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (48) +[default6]:Traceback (most recent call last): +[default6]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default6]: main() +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default6]: return f(*args, **kwargs) +[default6]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default6]: pretrain( +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default6]: initialize_megatron(extra_args_provider=extra_args_provider, +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron +[default6]: set_global_variables(extra_args_provider=extra_args_provider, +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables +[default6]: _build_num_microbatches_calculator(args) +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator +[default6]: _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator +[default6]: num_microbatches_calculator = ConstantNumMicroBatches( +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ +[default6]: assert global_batch_size % micro_batch_times_data_parallel == 0, \ +[default6]:AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (48) +[default7]:Traceback (most recent call last): +[default7]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default6]:Traceback (most recent call last): +[default6]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default6]: main() +[default5]:Traceback (most recent call last): +[default5]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default5]: main() +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default5]: return f(*args, **kwargs) +[default5]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default5]: pretrain( +[default7]: main() +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default7]: return f(*args, **kwargs) +[default7]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default5]: initialize_megatron(extra_args_provider=extra_args_provider, +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron +[default7]: pretrain( +[default5]: set_global_variables(extra_args_provider=extra_args_provider, +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default6]: return f(*args, **kwargs) +[default5]: _build_num_microbatches_calculator(args) +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator +[default5]: _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +[default6]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default6]: pretrain( +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default7]: initialize_megatron(extra_args_provider=extra_args_provider, +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator +[default5]: num_microbatches_calculator = ConstantNumMicroBatches( +[default6]: initialize_megatron(extra_args_provider=extra_args_provider, +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ +[default5]: assert global_batch_size % micro_batch_times_data_parallel == 0, \ +[default6]: set_global_variables(extra_args_provider=extra_args_provider, +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables +[default7]: set_global_variables(extra_args_provider=extra_args_provider, +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables +[default7]: _build_num_microbatches_calculator(args) +[default5]:AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (48) +[default6]: _build_num_microbatches_calculator(args) +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator +[default6]: _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator +[default7]: _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator +[default6]: num_microbatches_calculator = ConstantNumMicroBatches( +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ +[default6]: assert global_batch_size % micro_batch_times_data_parallel == 0, \ +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator +[default7]: num_microbatches_calculator = ConstantNumMicroBatches( +[default6]:AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (48) +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ +[default7]: assert global_batch_size % micro_batch_times_data_parallel == 0, \ +[default7]:AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (48) +ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 0 (pid: 1753058) of binary: /gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/bin/python +ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 0 (pid: 3914692) of binary: /gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/bin/python +ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 0 (pid: 1093434) of binary: /gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/bin/python +ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 0 (pid: 825823) of binary: /gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/bin/python +ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 0 (pid: 3525606) of binary: /gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/bin/python +ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 0 (pid: 3438391) of binary: /gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/bin/python +ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 0 (pid: 74446) of binary: /gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/bin/python +ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 0 (pid: 1066728) of binary: /gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/bin/python +ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 0 (pid: 3725715) of binary: /gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/bin/python +ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 0 (pid: 880478) of binary: /gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/bin/python +ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 0 (pid: 2796214) of binary: /gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/bin/python +ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 0 (pid: 2861080) of binary: /gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/bin/python +Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 194, in _run_module_as_main + return _run_code(code, main_globals, None, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 87, in _run_code + exec(code, run_globals) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 765, in +Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 194, in _run_module_as_main + return _run_code(code, main_globals, None, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 87, in _run_code + exec(code, run_globals) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 765, in +Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 194, in _run_module_as_main +Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 194, in _run_module_as_main + return _run_code(code, main_globals, None, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 87, in _run_code + exec(code, run_globals) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 765, in + main() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 761, in main + run(args) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 752, in run + main() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 194, in _run_module_as_main +Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 194, in _run_module_as_main +Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 194, in _run_module_as_main + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 761, in main +Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 194, in _run_module_as_main + elastic_launch( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 131, in __call__ + return _run_code(code, main_globals, None, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 87, in _run_code +Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 194, in _run_module_as_main +Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 194, in _run_module_as_main + exec(code, run_globals) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 765, in +Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 194, in _run_module_as_main + return _run_code(code, main_globals, None, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 87, in _run_code + run(args) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 752, in run + exec(code, run_globals) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 765, in + main() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + elastic_launch( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 131, in __call__ + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 761, in main + return launch_agent(self._config, self._entrypoint, list(args)) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 245, in launch_agent + main() + raise ChildFailedError( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +torch.distributed.elastic.multiprocessing.errors.ChildFailedError: +============================================================ +/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py FAILED +------------------------------------------------------------ +Failures: +[1]: + time : 2022-10-06_12:54:52 + host : jean-zay-iam22-ib0 + rank : 25 (local_rank: 1) + exitcode : 1 (pid: 3725716) + error_file: /tmp/torchelastic_8rd27p5m/none_azdg255a/attempt_0/1/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron + set_global_variables(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables + _build_num_microbatches_calculator(args) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 761, in main + run(args) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 752, in run + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator + num_microbatches_calculator = ConstantNumMicroBatches( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ + assert global_batch_size % micro_batch_times_data_parallel == 0, \ + AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (48) + +[2]: + time : 2022-10-06_12:54:52 + host : jean-zay-iam22-ib0 + rank : 26 (local_rank: 2) + exitcode : 1 (pid: 3725717) + error_file: /tmp/torchelastic_8rd27p5m/none_azdg255a/attempt_0/2/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron + set_global_variables(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables + _build_num_microbatches_calculator(args) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator + num_microbatches_calculator = ConstantNumMicroBatches( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ + assert global_batch_size % micro_batch_times_data_parallel == 0, \ + AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (48) + +[3]: + time : 2022-10-06_12:54:52 + host : jean-zay-iam22-ib0 + rank : 27 (local_rank: 3) + exitcode : 1 (pid: 3725718) + error_file: /tmp/torchelastic_8rd27p5m/none_azdg255a/attempt_0/3/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron + set_global_variables(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables + _build_num_microbatches_calculator(args) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator + num_microbatches_calculator = ConstantNumMicroBatches( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ + assert global_batch_size % micro_batch_times_data_parallel == 0, \ + AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (48) + +[4]: + time : 2022-10-06_12:54:52 + host : jean-zay-iam22-ib0 + rank : 28 (local_rank: 4) + exitcode : 1 (pid: 3725719) + error_file: /tmp/torchelastic_8rd27p5m/none_azdg255a/attempt_0/4/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron + set_global_variables(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables + _build_num_microbatches_calculator(args) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator + num_microbatches_calculator = ConstantNumMicroBatches( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ + assert global_batch_size % micro_batch_times_data_parallel == 0, \ + AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (48) + +[5]: + time : 2022-10-06_12:54:52 + host : jean-zay-iam22-ib0 + rank : 29 (local_rank: 5) + exitcode : 1 (pid: 3725720) + error_file: /tmp/torchelastic_8rd27p5m/none_azdg255a/attempt_0/5/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron + set_global_variables(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables + _build_num_microbatches_calculator(args) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator + num_microbatches_calculator = ConstantNumMicroBatches( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ + assert global_batch_size % micro_batch_times_data_parallel == 0, \ + AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (48) + +[6]: + time : 2022-10-06_12:54:52 + host : jean-zay-iam22-ib0 + rank : 30 (local_rank: 6) + exitcode : 1 (pid: 3725721) + error_file: /tmp/torchelastic_8rd27p5m/none_azdg255a/attempt_0/6/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron + set_global_variables(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables + _build_num_microbatches_calculator(args) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator + num_microbatches_calculator = ConstantNumMicroBatches( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ + assert global_batch_size % micro_batch_times_data_parallel == 0, \ + AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (48) + +[7]: + time : 2022-10-06_12:54:52 + host : jean-zay-iam22-ib0 + rank : 31 (local_rank: 7) + exitcode : 1 (pid: 3725722) + error_file: /tmp/torchelastic_8rd27p5m/none_azdg255a/attempt_0/7/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron + set_global_variables(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables + _build_num_microbatches_calculator(args) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator + num_microbatches_calculator = ConstantNumMicroBatches( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ + assert global_batch_size % micro_batch_times_data_parallel == 0, \ + AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (48) + +------------------------------------------------------------ +Root Cause (first observed failure): +[0]: + time : 2022-10-06_12:54:52 + host : jean-zay-iam22-ib0 + rank : 24 (local_rank: 0) + exitcode : 1 (pid: 3725715) + error_file: /tmp/torchelastic_8rd27p5m/none_azdg255a/attempt_0/0/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron + set_global_variables(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables + _build_num_microbatches_calculator(args) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator + num_microbatches_calculator = ConstantNumMicroBatches( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ + assert global_batch_size % micro_batch_times_data_parallel == 0, \ + AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (48) + +============================================================ + run(args) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 752, in run + elastic_launch( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 131, in __call__ + return _run_code(code, main_globals, None, + return _run_code(code, main_globals, None, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 87, in _run_code + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 87, in _run_code + elastic_launch( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 131, in __call__ + return _run_code(code, main_globals, None, + return launch_agent(self._config, self._entrypoint, list(args)) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 245, in launch_agent + exec(code, run_globals) + exec(code, run_globals) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 765, in + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 87, in _run_code + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 765, in + exec(code, run_globals) + raise ChildFailedError( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 765, in +torch.distributed.elastic.multiprocessing.errors.ChildFailedError: +============================================================ +/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py FAILED +------------------------------------------------------------ +Failures: +[1]: + time : 2022-10-06_12:54:52 + host : jean-zay-iam12-ib0 + rank : 17 (local_rank: 1) + exitcode : 1 (pid: 1093435) + error_file: /tmp/torchelastic_nigi8cio/none_zn5k9_va/attempt_0/1/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron + set_global_variables(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables + _build_num_microbatches_calculator(args) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator + num_microbatches_calculator = ConstantNumMicroBatches( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ + assert global_batch_size % micro_batch_times_data_parallel == 0, \ + AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (48) + +[2]: + time : 2022-10-06_12:54:52 + host : jean-zay-iam12-ib0 + rank : 18 (local_rank: 2) + exitcode : 1 (pid: 1093436) + error_file: /tmp/torchelastic_nigi8cio/none_zn5k9_va/attempt_0/2/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron + set_global_variables(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables + _build_num_microbatches_calculator(args) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator + num_microbatches_calculator = ConstantNumMicroBatches( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ + assert global_batch_size % micro_batch_times_data_parallel == 0, \ + AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (48) + +[3]: + time : 2022-10-06_12:54:52 + host : jean-zay-iam12-ib0 + rank : 19 (local_rank: 3) + exitcode : 1 (pid: 1093437) + error_file: /tmp/torchelastic_nigi8cio/none_zn5k9_va/attempt_0/3/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron + set_global_variables(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables + _build_num_microbatches_calculator(args) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator + num_microbatches_calculator = ConstantNumMicroBatches( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ + assert global_batch_size % micro_batch_times_data_parallel == 0, \ + AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (48) + +[4]: + time : 2022-10-06_12:54:52 + host : jean-zay-iam12-ib0 + rank : 20 (local_rank: 4) + exitcode : 1 (pid: 1093438) + error_file: /tmp/torchelastic_nigi8cio/none_zn5k9_va/attempt_0/4/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron + set_global_variables(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables + _build_num_microbatches_calculator(args) + main() + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator + num_microbatches_calculator = ConstantNumMicroBatches( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ + assert global_batch_size % micro_batch_times_data_parallel == 0, \ + AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (48) + +[5]: + time : 2022-10-06_12:54:52 + host : jean-zay-iam12-ib0 + rank : 21 (local_rank: 5) + exitcode : 1 (pid: 1093439) + main() + error_file: /tmp/torchelastic_nigi8cio/none_zn5k9_va/attempt_0/5/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron + set_global_variables(extra_args_provider=extra_args_provider, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables + _build_num_microbatches_calculator(args) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator + num_microbatches_calculator = ConstantNumMicroBatches( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ + assert global_batch_size % micro_batch_times_data_parallel == 0, \ + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (48) + +[6]: + time : 2022-10-06_12:54:52 + host : jean-zay-iam12-ib0 + rank : 22 (local_rank: 6) + exitcode : 1 (pid: 1093440) + error_file: /tmp/torchelastic_nigi8cio/none_zn5k9_va/attempt_0/6/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron + set_global_variables(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables + _build_num_microbatches_calculator(args) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator + num_microbatches_calculator = ConstantNumMicroBatches( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ + assert global_batch_size % micro_batch_times_data_parallel == 0, \ + AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (48) + +[7]: + time : 2022-10-06_12:54:52 + host : jean-zay-iam12-ib0 + rank : 23 (local_rank: 7) + exitcode : 1 (pid: 1093443) + error_file: /tmp/torchelastic_nigi8cio/none_zn5k9_va/attempt_0/7/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron + set_global_variables(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables + _build_num_microbatches_calculator(args) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator + num_microbatches_calculator = ConstantNumMicroBatches( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ + assert global_batch_size % micro_batch_times_data_parallel == 0, \ + AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (48) + +------------------------------------------------------------ +Root Cause (first observed failure): +[0]: + time : 2022-10-06_12:54:52 + host : jean-zay-iam12-ib0 + rank : 16 (local_rank: 0) + exitcode : 1 (pid: 1093434) + error_file: /tmp/torchelastic_nigi8cio/none_zn5k9_va/attempt_0/0/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron + set_global_variables(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables + _build_num_microbatches_calculator(args) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator + num_microbatches_calculator = ConstantNumMicroBatches( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ + assert global_batch_size % micro_batch_times_data_parallel == 0, \ + AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (48) + +============================================================ + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 761, in main + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 761, in main +Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 194, in _run_module_as_main + run(args) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 752, in run + return _run_code(code, main_globals, None, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 87, in _run_code + run(args) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 752, in run + exec(code, run_globals) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 765, in + return launch_agent(self._config, self._entrypoint, list(args)) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 245, in launch_agent + elastic_launch( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 131, in __call__ + raise ChildFailedError( + elastic_launch( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 131, in __call__ + return launch_agent(self._config, self._entrypoint, list(args)) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 245, in launch_agent +torch.distributed.elastic.multiprocessing.errors.ChildFailedError: +============================================================ +/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py FAILED +------------------------------------------------------------ +Failures: +[1]: + time : 2022-10-06_12:54:52 + host : jean-zay-iam25-ib0 + rank : 49 (local_rank: 1) + exitcode : 1 (pid: 74447) + error_file: /tmp/torchelastic_t4l4wl4t/none_8605rg8n/attempt_0/1/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + main() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return launch_agent(self._config, self._entrypoint, list(args)) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 245, in launch_agent + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron + set_global_variables(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables + _build_num_microbatches_calculator(args) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + raise ChildFailedError( + main() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator + num_microbatches_calculator = ConstantNumMicroBatches( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ + assert global_batch_size % micro_batch_times_data_parallel == 0, \ + AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (48) + +[2]: + time : 2022-10-06_12:54:52 + host : jean-zay-iam25-ib0 + rank : 50 (local_rank: 2) + exitcode : 1 (pid: 74448) + error_file: /tmp/torchelastic_t4l4wl4t/none_8605rg8n/attempt_0/2/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron + set_global_variables(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables + _build_num_microbatches_calculator(args) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator + raise ChildFailedError( + main() + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +torch.distributed.elastic.multiprocessing.errors.ChildFailedError: +============================================================ +/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py FAILED +------------------------------------------------------------ +Failures: +[1]: + time : 2022-10-06_12:54:52 + host : jean-zay-iam29-ib0 + rank : 73 (local_rank: 1) + exitcode : 1 (pid: 1753059) + error_file: /tmp/torchelastic_qrqbf5ft/none_i8mb4q_9/attempt_0/1/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator + num_microbatches_calculator = ConstantNumMicroBatches( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ + assert global_batch_size % micro_batch_times_data_parallel == 0, \ + AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (48) + +[3]: + time : 2022-10-06_12:54:52 + host : jean-zay-iam25-ib0 + rank : 51 (local_rank: 3) + exitcode : 1 (pid: 74449) + error_file: /tmp/torchelastic_t4l4wl4t/none_8605rg8n/attempt_0/3/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 761, in main + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron + set_global_variables(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables + _build_num_microbatches_calculator(args) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron + set_global_variables(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables + _build_num_microbatches_calculator(args) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator + num_microbatches_calculator = ConstantNumMicroBatches( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ + assert global_batch_size % micro_batch_times_data_parallel == 0, \ + AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (48) + +[4]: + time : 2022-10-06_12:54:52 + host : jean-zay-iam25-ib0 + rank : 52 (local_rank: 4) + exitcode : 1 (pid: 74450) + error_file: /tmp/torchelastic_t4l4wl4t/none_8605rg8n/attempt_0/4/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron + set_global_variables(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables + _build_num_microbatches_calculator(args) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator + num_microbatches_calculator = ConstantNumMicroBatches( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ + assert global_batch_size % micro_batch_times_data_parallel == 0, \ + AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (48) + +[5]: + time : 2022-10-06_12:54:52 + host : jean-zay-iam25-ib0 + rank : 53 (local_rank: 5) + exitcode : 1 (pid: 74451) + error_file: /tmp/torchelastic_t4l4wl4t/none_8605rg8n/attempt_0/5/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron + set_global_variables(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator + num_microbatches_calculator = ConstantNumMicroBatches( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ + assert global_batch_size % micro_batch_times_data_parallel == 0, \ + AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (48) + +[2]: + time : 2022-10-06_12:54:52 + host : jean-zay-iam29-ib0 + rank : 74 (local_rank: 2) + exitcode : 1 (pid: 1753060) + error_file: /tmp/torchelastic_qrqbf5ft/none_i8mb4q_9/attempt_0/2/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables + _build_num_microbatches_calculator(args) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator + num_microbatches_calculator = ConstantNumMicroBatches( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ + assert global_batch_size % micro_batch_times_data_parallel == 0, \ + AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (48) + +[6]: + time : 2022-10-06_12:54:52 + host : jean-zay-iam25-ib0 + rank : 54 (local_rank: 6) + exitcode : 1 (pid: 74452) + error_file: /tmp/torchelastic_t4l4wl4t/none_8605rg8n/attempt_0/6/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron + set_global_variables(extra_args_provider=extra_args_provider, + return f(*args, **kwargs) +torch.distributed.elastic.multiprocessing.errors.ChildFailedError: +============================================================ +/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py FAILED +------------------------------------------------------------ +Failures: +[1]: + time : 2022-10-06_12:54:52 + host : jean-zay-iam23-ib0 + rank : 33 (local_rank: 1) + exitcode : 1 (pid: 2861081) + error_file: /tmp/torchelastic_tpprhzei/none_w5cx3ii_/attempt_0/1/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables + _build_num_microbatches_calculator(args) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator + num_microbatches_calculator = ConstantNumMicroBatches( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ + assert global_batch_size % micro_batch_times_data_parallel == 0, \ + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 761, in main + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron + set_global_variables(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables + _build_num_microbatches_calculator(args) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator + AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (48) + +[7]: + time : 2022-10-06_12:54:52 + host : jean-zay-iam25-ib0 + rank : 55 (local_rank: 7) + exitcode : 1 (pid: 74453) + error_file: /tmp/torchelastic_t4l4wl4t/none_8605rg8n/attempt_0/7/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron + set_global_variables(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables + _build_num_microbatches_calculator(args) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator + num_microbatches_calculator = ConstantNumMicroBatches( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator + num_microbatches_calculator = ConstantNumMicroBatches( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ + assert global_batch_size % micro_batch_times_data_parallel == 0, \ + AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (48) + +[3]: + time : 2022-10-06_12:54:52 + host : jean-zay-iam29-ib0 + rank : 75 (local_rank: 3) + exitcode : 1 (pid: 1753061) + error_file: /tmp/torchelastic_qrqbf5ft/none_i8mb4q_9/attempt_0/3/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ + assert global_batch_size % micro_batch_times_data_parallel == 0, \ + AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (48) + +------------------------------------------------------------ +Root Cause (first observed failure): +[0]: + time : 2022-10-06_12:54:52 + host : jean-zay-iam25-ib0 + rank : 48 (local_rank: 0) + exitcode : 1 (pid: 74446) + error_file: /tmp/torchelastic_t4l4wl4t/none_8605rg8n/attempt_0/0/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron + set_global_variables(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables + _build_num_microbatches_calculator(args) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron + set_global_variables(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables + _build_num_microbatches_calculator(args) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator + num_microbatches_calculator = ConstantNumMicroBatches( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ + assert global_batch_size % micro_batch_times_data_parallel == 0, \ + AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (48) + +[4]: + time : 2022-10-06_12:54:52 + host : jean-zay-iam29-ib0 + rank : 76 (local_rank: 4) + exitcode : 1 (pid: 1753062) + error_file: /tmp/torchelastic_qrqbf5ft/none_i8mb4q_9/attempt_0/4/error.json + traceback : Traceback (most recent call last): + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron + set_global_variables(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables + _build_num_microbatches_calculator(args) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator + num_microbatches_calculator = ConstantNumMicroBatches( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator + num_microbatches_calculator = ConstantNumMicroBatches( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ + assert global_batch_size % micro_batch_times_data_parallel == 0, \ + AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (48) + +[2]: + time : 2022-10-06_12:54:52 + host : jean-zay-iam23-ib0 + rank : 34 (local_rank: 2) + exitcode : 1 (pid: 2861082) + error_file: /tmp/torchelastic_tpprhzei/none_w5cx3ii_/attempt_0/2/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron + set_global_variables(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables + _build_num_microbatches_calculator(args) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 761, in main + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ + assert global_batch_size % micro_batch_times_data_parallel == 0, \ + AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (48) + +============================================================ + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator + num_microbatches_calculator = ConstantNumMicroBatches( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ + assert global_batch_size % micro_batch_times_data_parallel == 0, \ + AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (48) + +[5]: + time : 2022-10-06_12:54:52 + host : jean-zay-iam29-ib0 + rank : 77 (local_rank: 5) + exitcode : 1 (pid: 1753063) + error_file: /tmp/torchelastic_qrqbf5ft/none_i8mb4q_9/attempt_0/5/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron + set_global_variables(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables + _build_num_microbatches_calculator(args) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator + num_microbatches_calculator = ConstantNumMicroBatches( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ + assert global_batch_size % micro_batch_times_data_parallel == 0, \ + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron + set_global_variables(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables + _build_num_microbatches_calculator(args) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator + AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (48) + +[6]: + time : 2022-10-06_12:54:52 + host : jean-zay-iam29-ib0 + rank : 78 (local_rank: 6) + exitcode : 1 (pid: 1753064) + error_file: /tmp/torchelastic_qrqbf5ft/none_i8mb4q_9/attempt_0/6/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron + set_global_variables(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator + num_microbatches_calculator = ConstantNumMicroBatches( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ + assert global_batch_size % micro_batch_times_data_parallel == 0, \ + AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (48) + +[3]: + time : 2022-10-06_12:54:52 + host : jean-zay-iam23-ib0 + rank : 35 (local_rank: 3) + exitcode : 1 (pid: 2861083) + error_file: /tmp/torchelastic_tpprhzei/none_w5cx3ii_/attempt_0/3/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables + _build_num_microbatches_calculator(args) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator + num_microbatches_calculator = ConstantNumMicroBatches( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ + assert global_batch_size % micro_batch_times_data_parallel == 0, \ + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron + set_global_variables(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables + _build_num_microbatches_calculator(args) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator + AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (48) + +[7]: + time : 2022-10-06_12:54:52 + host : jean-zay-iam29-ib0 + rank : 79 (local_rank: 7) + exitcode : 1 (pid: 1753065) + error_file: /tmp/torchelastic_qrqbf5ft/none_i8mb4q_9/attempt_0/7/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator + num_microbatches_calculator = ConstantNumMicroBatches( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ + assert global_batch_size % micro_batch_times_data_parallel == 0, \ + AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (48) + +[4]: + time : 2022-10-06_12:54:52 + host : jean-zay-iam23-ib0 + rank : 36 (local_rank: 4) + exitcode : 1 (pid: 2861084) + error_file: /tmp/torchelastic_tpprhzei/none_w5cx3ii_/attempt_0/4/error.json + traceback : Traceback (most recent call last): + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron + set_global_variables(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables + _build_num_microbatches_calculator(args) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator + num_microbatches_calculator = ConstantNumMicroBatches( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron + set_global_variables(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables + _build_num_microbatches_calculator(args) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ + assert global_batch_size % micro_batch_times_data_parallel == 0, \ + AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (48) + +------------------------------------------------------------ +Root Cause (first observed failure): +[0]: + time : 2022-10-06_12:54:52 + host : jean-zay-iam29-ib0 + rank : 72 (local_rank: 0) + exitcode : 1 (pid: 1753058) + error_file: /tmp/torchelastic_qrqbf5ft/none_i8mb4q_9/attempt_0/0/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator + num_microbatches_calculator = ConstantNumMicroBatches( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ + assert global_batch_size % micro_batch_times_data_parallel == 0, \ + AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (48) + +[5]: + time : 2022-10-06_12:54:52 + host : jean-zay-iam23-ib0 + rank : 37 (local_rank: 5) + exitcode : 1 (pid: 2861085) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + error_file: /tmp/torchelastic_tpprhzei/none_w5cx3ii_/attempt_0/5/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron + set_global_variables(extra_args_provider=extra_args_provider, + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron + set_global_variables(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables + _build_num_microbatches_calculator(args) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator + num_microbatches_calculator = ConstantNumMicroBatches( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables + _build_num_microbatches_calculator(args) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator + num_microbatches_calculator = ConstantNumMicroBatches( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ + assert global_batch_size % micro_batch_times_data_parallel == 0, \ + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ + assert global_batch_size % micro_batch_times_data_parallel == 0, \ + AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (48) + +============================================================ + run(args) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 752, in run + AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (48) + +[6]: + time : 2022-10-06_12:54:52 + host : jean-zay-iam23-ib0 + rank : 38 (local_rank: 6) + exitcode : 1 (pid: 2861086) + error_file: /tmp/torchelastic_tpprhzei/none_w5cx3ii_/attempt_0/6/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron + set_global_variables(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables + _build_num_microbatches_calculator(args) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator + num_microbatches_calculator = ConstantNumMicroBatches( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ + assert global_batch_size % micro_batch_times_data_parallel == 0, \ + AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (48) + +[7]: + time : 2022-10-06_12:54:52 + host : jean-zay-iam23-ib0 + rank : 39 (local_rank: 7) + exitcode : 1 (pid: 2861087) + error_file: /tmp/torchelastic_tpprhzei/none_w5cx3ii_/attempt_0/7/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron + set_global_variables(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables + _build_num_microbatches_calculator(args) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator + num_microbatches_calculator = ConstantNumMicroBatches( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ + assert global_batch_size % micro_batch_times_data_parallel == 0, \ + AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (48) + +------------------------------------------------------------ +Root Cause (first observed failure): +[0]: + time : 2022-10-06_12:54:52 + host : jean-zay-iam23-ib0 + rank : 32 (local_rank: 0) + exitcode : 1 (pid: 2861080) + error_file: /tmp/torchelastic_tpprhzei/none_w5cx3ii_/attempt_0/0/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron + set_global_variables(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables + _build_num_microbatches_calculator(args) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator + num_microbatches_calculator = ConstantNumMicroBatches( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ + assert global_batch_size % micro_batch_times_data_parallel == 0, \ + AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (48) + +============================================================ + run(args) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 752, in run + run(args) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 752, in run + elastic_launch( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 131, in __call__ + return launch_agent(self._config, self._entrypoint, list(args)) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 245, in launch_agent + elastic_launch( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 131, in __call__ + return launch_agent(self._config, self._entrypoint, list(args)) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 245, in launch_agent + elastic_launch( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 131, in __call__ + raise ChildFailedError( + raise ChildFailedError( +torch.distributed.elastic.multiprocessing.errors.ChildFailedError: +============================================================ +/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py FAILED +------------------------------------------------------------ +Failures: +[1]: + time : 2022-10-06_12:54:50 + host : jean-zay-iam24-ib0 + rank : 41 (local_rank: 1) + exitcode : 1 (pid: 880479) + error_file: /tmp/torchelastic_07knduzs/none_o555zxc_/attempt_0/1/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( +torch.distributed.elastic.multiprocessing.errors.ChildFailedError: +============================================================ +/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py FAILED +------------------------------------------------------------ +Failures: +[1]: + time : 2022-10-06_12:54:52 + host : jean-zay-iam27-ib0 + rank : 57 (local_rank: 1) + exitcode : 1 (pid: 3438392) + error_file: /tmp/torchelastic_0vbcvu1p/none_l_a_z_xm/attempt_0/1/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron + set_global_variables(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables + _build_num_microbatches_calculator(args) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron + set_global_variables(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables + _build_num_microbatches_calculator(args) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator + num_microbatches_calculator = ConstantNumMicroBatches( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ + assert global_batch_size % micro_batch_times_data_parallel == 0, \ + AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (48) + +[2]: + time : 2022-10-06_12:54:50 + host : jean-zay-iam24-ib0 + rank : 42 (local_rank: 2) + exitcode : 1 (pid: 880480) + error_file: /tmp/torchelastic_07knduzs/none_o555zxc_/attempt_0/2/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator + num_microbatches_calculator = ConstantNumMicroBatches( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ + assert global_batch_size % micro_batch_times_data_parallel == 0, \ + AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (48) + +[2]: + time : 2022-10-06_12:54:52 + host : jean-zay-iam27-ib0 + rank : 58 (local_rank: 2) + exitcode : 1 (pid: 3438393) + error_file: /tmp/torchelastic_0vbcvu1p/none_l_a_z_xm/attempt_0/2/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron + set_global_variables(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables + _build_num_microbatches_calculator(args) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron + set_global_variables(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables + _build_num_microbatches_calculator(args) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator + num_microbatches_calculator = ConstantNumMicroBatches( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ + assert global_batch_size % micro_batch_times_data_parallel == 0, \ + AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (48) + +[3]: + time : 2022-10-06_12:54:50 + host : jean-zay-iam24-ib0 + rank : 43 (local_rank: 3) + exitcode : 1 (pid: 880481) + error_file: /tmp/torchelastic_07knduzs/none_o555zxc_/attempt_0/3/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron + set_global_variables(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables + _build_num_microbatches_calculator(args) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator + num_microbatches_calculator = ConstantNumMicroBatches( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ + assert global_batch_size % micro_batch_times_data_parallel == 0, \ + AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (48) + +[3]: + time : 2022-10-06_12:54:52 + host : jean-zay-iam27-ib0 + rank : 59 (local_rank: 3) + exitcode : 1 (pid: 3438394) + error_file: /tmp/torchelastic_0vbcvu1p/none_l_a_z_xm/attempt_0/3/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator + num_microbatches_calculator = ConstantNumMicroBatches( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ + assert global_batch_size % micro_batch_times_data_parallel == 0, \ + AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (48) + +[4]: + time : 2022-10-06_12:54:50 + host : jean-zay-iam24-ib0 + rank : 44 (local_rank: 4) + exitcode : 1 (pid: 880482) + error_file: /tmp/torchelastic_07knduzs/none_o555zxc_/attempt_0/4/error.json + traceback : Traceback (most recent call last): + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron + set_global_variables(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables + _build_num_microbatches_calculator(args) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron + set_global_variables(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables + _build_num_microbatches_calculator(args) + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator + num_microbatches_calculator = ConstantNumMicroBatches( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ + assert global_batch_size % micro_batch_times_data_parallel == 0, \ + AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (48) + +[4]: + time : 2022-10-06_12:54:52 + host : jean-zay-iam27-ib0 + rank : 60 (local_rank: 4) + exitcode : 1 (pid: 3438395) + error_file: /tmp/torchelastic_0vbcvu1p/none_l_a_z_xm/attempt_0/4/error.json + traceback : Traceback (most recent call last): + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator + num_microbatches_calculator = ConstantNumMicroBatches( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ + assert global_batch_size % micro_batch_times_data_parallel == 0, \ + AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (48) + +[5]: + time : 2022-10-06_12:54:50 + host : jean-zay-iam24-ib0 + rank : 45 (local_rank: 5) + exitcode : 1 (pid: 880483) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron + set_global_variables(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables + _build_num_microbatches_calculator(args) + error_file: /tmp/torchelastic_07knduzs/none_o555zxc_/attempt_0/5/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron + set_global_variables(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator + num_microbatches_calculator = ConstantNumMicroBatches( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ + assert global_batch_size % micro_batch_times_data_parallel == 0, \ + AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (48) + +[5]: + time : 2022-10-06_12:54:52 + host : jean-zay-iam27-ib0 + rank : 61 (local_rank: 5) + exitcode : 1 (pid: 3438396) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables + _build_num_microbatches_calculator(args) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator + num_microbatches_calculator = ConstantNumMicroBatches( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ + assert global_batch_size % micro_batch_times_data_parallel == 0, \ + error_file: /tmp/torchelastic_0vbcvu1p/none_l_a_z_xm/attempt_0/5/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron + set_global_variables(extra_args_provider=extra_args_provider, + AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (48) + +[6]: + time : 2022-10-06_12:54:50 + host : jean-zay-iam24-ib0 + rank : 46 (local_rank: 6) + exitcode : 1 (pid: 880484) + error_file: /tmp/torchelastic_07knduzs/none_o555zxc_/attempt_0/6/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables + _build_num_microbatches_calculator(args) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator + num_microbatches_calculator = ConstantNumMicroBatches( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ + assert global_batch_size % micro_batch_times_data_parallel == 0, \ + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron + set_global_variables(extra_args_provider=extra_args_provider, + AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (48) + +[6]: + time : 2022-10-06_12:54:52 + host : jean-zay-iam27-ib0 + rank : 62 (local_rank: 6) + exitcode : 1 (pid: 3438397) + error_file: /tmp/torchelastic_0vbcvu1p/none_l_a_z_xm/attempt_0/6/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables + _build_num_microbatches_calculator(args) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator + num_microbatches_calculator = ConstantNumMicroBatches( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ + assert global_batch_size % micro_batch_times_data_parallel == 0, \ + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron + set_global_variables(extra_args_provider=extra_args_provider, + AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (48) + +[7]: + time : 2022-10-06_12:54:50 + host : jean-zay-iam24-ib0 + rank : 47 (local_rank: 7) + exitcode : 1 (pid: 880485) + error_file: /tmp/torchelastic_07knduzs/none_o555zxc_/attempt_0/7/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables + _build_num_microbatches_calculator(args) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator + num_microbatches_calculator = ConstantNumMicroBatches( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ + assert global_batch_size % micro_batch_times_data_parallel == 0, \ + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron + set_global_variables(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables + _build_num_microbatches_calculator(args) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator + num_microbatches_calculator = ConstantNumMicroBatches( + AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (48) + +[7]: + time : 2022-10-06_12:54:52 + host : jean-zay-iam27-ib0 + rank : 63 (local_rank: 7) + exitcode : 1 (pid: 3438398) + error_file: /tmp/torchelastic_0vbcvu1p/none_l_a_z_xm/attempt_0/7/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ + assert global_batch_size % micro_batch_times_data_parallel == 0, \ + AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (48) + +------------------------------------------------------------ +Root Cause (first observed failure): +[0]: + time : 2022-10-06_12:54:50 + host : jean-zay-iam24-ib0 + rank : 40 (local_rank: 0) + exitcode : 1 (pid: 880478) + error_file: /tmp/torchelastic_07knduzs/none_o555zxc_/attempt_0/0/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron + set_global_variables(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables + _build_num_microbatches_calculator(args) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator + num_microbatches_calculator = ConstantNumMicroBatches( + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ + assert global_batch_size % micro_batch_times_data_parallel == 0, \ + AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (48) + +------------------------------------------------------------ +Root Cause (first observed failure): +[0]: + time : 2022-10-06_12:54:52 + host : jean-zay-iam27-ib0 + rank : 56 (local_rank: 0) + exitcode : 1 (pid: 3438391) + error_file: /tmp/torchelastic_0vbcvu1p/none_l_a_z_xm/attempt_0/0/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron + set_global_variables(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables + _build_num_microbatches_calculator(args) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator + num_microbatches_calculator = ConstantNumMicroBatches( + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ + assert global_batch_size % micro_batch_times_data_parallel == 0, \ + AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (48) + +============================================================ + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron + set_global_variables(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables + _build_num_microbatches_calculator(args) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator + num_microbatches_calculator = ConstantNumMicroBatches( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ + assert global_batch_size % micro_batch_times_data_parallel == 0, \ + AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (48) + +============================================================ + return _run_code(code, main_globals, None, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 87, in _run_code + return _run_code(code, main_globals, None, + return _run_code(code, main_globals, None, + exec(code, run_globals) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 765, in + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 87, in _run_code + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 87, in _run_code + exec(code, run_globals) + exec(code, run_globals) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 765, in + return launch_agent(self._config, self._entrypoint, list(args)) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 245, in launch_agent + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 765, in + return launch_agent(self._config, self._entrypoint, list(args)) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 245, in launch_agent + raise ChildFailedError( + main() + raise ChildFailedError( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +torch.distributed.elastic.multiprocessing.errors.ChildFailedError: +============================================================ +/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py FAILED +------------------------------------------------------------ +Failures: +[1]: + time : 2022-10-06_12:54:52 + host : jean-zay-iam31-ib0 + rank : 81 (local_rank: 1) + exitcode : 1 (pid: 825824) + error_file: /tmp/torchelastic_en5an3mm/none_jx59hwlr/attempt_0/1/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( +torch.distributed.elastic.multiprocessing.errors.ChildFailedError: +============================================================ +/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py FAILED +------------------------------------------------------------ +Failures: +[1]: + time : 2022-10-06_12:54:52 + host : jean-zay-iam10-ib0 + rank : 1 (local_rank: 1) + exitcode : 1 (pid: 2796215) + error_file: /tmp/torchelastic_yacw5z7i/none_s905kneh/attempt_0/1/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron + set_global_variables(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables + _build_num_microbatches_calculator(args) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator + num_microbatches_calculator = ConstantNumMicroBatches( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ + assert global_batch_size % micro_batch_times_data_parallel == 0, \ + AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (48) + +[2]: + time : 2022-10-06_12:54:52 + host : jean-zay-iam31-ib0 + rank : 82 (local_rank: 2) + exitcode : 1 (pid: 825825) + error_file: /tmp/torchelastic_en5an3mm/none_jx59hwlr/attempt_0/2/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron + set_global_variables(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables + _build_num_microbatches_calculator(args) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 761, in main + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron + set_global_variables(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables + _build_num_microbatches_calculator(args) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator + num_microbatches_calculator = ConstantNumMicroBatches( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ + assert global_batch_size % micro_batch_times_data_parallel == 0, \ + AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (48) + +[3]: + time : 2022-10-06_12:54:52 + host : jean-zay-iam31-ib0 + rank : 83 (local_rank: 3) + exitcode : 1 (pid: 825826) + error_file: /tmp/torchelastic_en5an3mm/none_jx59hwlr/attempt_0/3/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator + num_microbatches_calculator = ConstantNumMicroBatches( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ + assert global_batch_size % micro_batch_times_data_parallel == 0, \ + AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (48) + +[2]: + time : 2022-10-06_12:54:52 + host : jean-zay-iam10-ib0 + rank : 2 (local_rank: 2) + exitcode : 1 (pid: 2796216) + error_file: /tmp/torchelastic_yacw5z7i/none_s905kneh/attempt_0/2/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron + set_global_variables(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables + _build_num_microbatches_calculator(args) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron + set_global_variables(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables + _build_num_microbatches_calculator(args) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator + num_microbatches_calculator = ConstantNumMicroBatches( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ + assert global_batch_size % micro_batch_times_data_parallel == 0, \ + AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (48) + +[4]: + time : 2022-10-06_12:54:52 + host : jean-zay-iam31-ib0 + rank : 84 (local_rank: 4) + exitcode : 1 (pid: 825827) + error_file: /tmp/torchelastic_en5an3mm/none_jx59hwlr/attempt_0/4/error.json + traceback : Traceback (most recent call last): + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron + set_global_variables(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables + _build_num_microbatches_calculator(args) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator + num_microbatches_calculator = ConstantNumMicroBatches( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ + assert global_batch_size % micro_batch_times_data_parallel == 0, \ + AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (48) + +[3]: + time : 2022-10-06_12:54:52 + host : jean-zay-iam10-ib0 + rank : 3 (local_rank: 3) + exitcode : 1 (pid: 2796217) + error_file: /tmp/torchelastic_yacw5z7i/none_s905kneh/attempt_0/3/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator + num_microbatches_calculator = ConstantNumMicroBatches( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ + assert global_batch_size % micro_batch_times_data_parallel == 0, \ + AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (48) + +[5]: + time : 2022-10-06_12:54:52 + host : jean-zay-iam31-ib0 + rank : 85 (local_rank: 5) + exitcode : 1 (pid: 825828) + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron + set_global_variables(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables + _build_num_microbatches_calculator(args) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator + error_file: /tmp/torchelastic_en5an3mm/none_jx59hwlr/attempt_0/5/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron + set_global_variables(extra_args_provider=extra_args_provider, + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator + num_microbatches_calculator = ConstantNumMicroBatches( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ + assert global_batch_size % micro_batch_times_data_parallel == 0, \ + AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (48) + +[4]: + time : 2022-10-06_12:54:52 + host : jean-zay-iam10-ib0 + rank : 4 (local_rank: 4) + exitcode : 1 (pid: 2796218) + error_file: /tmp/torchelastic_yacw5z7i/none_s905kneh/attempt_0/4/error.json + traceback : Traceback (most recent call last): + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables + _build_num_microbatches_calculator(args) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator + num_microbatches_calculator = ConstantNumMicroBatches( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ + assert global_batch_size % micro_batch_times_data_parallel == 0, \ + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron + set_global_variables(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables + _build_num_microbatches_calculator(args) + AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (48) + +[6]: + time : 2022-10-06_12:54:52 + host : jean-zay-iam31-ib0 + rank : 86 (local_rank: 6) + exitcode : 1 (pid: 825829) + error_file: /tmp/torchelastic_en5an3mm/none_jx59hwlr/attempt_0/6/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator + num_microbatches_calculator = ConstantNumMicroBatches( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ + assert global_batch_size % micro_batch_times_data_parallel == 0, \ + AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (48) + +[5]: + time : 2022-10-06_12:54:52 + host : jean-zay-iam10-ib0 + rank : 5 (local_rank: 5) + exitcode : 1 (pid: 2796219) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron + set_global_variables(extra_args_provider=extra_args_provider, + error_file: /tmp/torchelastic_yacw5z7i/none_s905kneh/attempt_0/5/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron + set_global_variables(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables + _build_num_microbatches_calculator(args) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator + num_microbatches_calculator = ConstantNumMicroBatches( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ + assert global_batch_size % micro_batch_times_data_parallel == 0, \ + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables + _build_num_microbatches_calculator(args) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator + num_microbatches_calculator = ConstantNumMicroBatches( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ + assert global_batch_size % micro_batch_times_data_parallel == 0, \ + AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (48) + +[7]: + time : 2022-10-06_12:54:52 + host : jean-zay-iam31-ib0 + rank : 87 (local_rank: 7) + exitcode : 1 (pid: 825830) + error_file: /tmp/torchelastic_en5an3mm/none_jx59hwlr/attempt_0/7/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (48) + +[6]: + time : 2022-10-06_12:54:52 + host : jean-zay-iam10-ib0 + rank : 6 (local_rank: 6) + exitcode : 1 (pid: 2796220) + error_file: /tmp/torchelastic_yacw5z7i/none_s905kneh/attempt_0/6/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron + set_global_variables(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables + _build_num_microbatches_calculator(args) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator + num_microbatches_calculator = ConstantNumMicroBatches( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron + set_global_variables(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ + assert global_batch_size % micro_batch_times_data_parallel == 0, \ + AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (48) + +------------------------------------------------------------ +Root Cause (first observed failure): +[0]: + time : 2022-10-06_12:54:52 + host : jean-zay-iam31-ib0 + rank : 80 (local_rank: 0) + exitcode : 1 (pid: 825823) + error_file: /tmp/torchelastic_en5an3mm/none_jx59hwlr/attempt_0/0/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables + _build_num_microbatches_calculator(args) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator + num_microbatches_calculator = ConstantNumMicroBatches( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ + assert global_batch_size % micro_batch_times_data_parallel == 0, \ + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (48) + +[7]: + time : 2022-10-06_12:54:52 + host : jean-zay-iam10-ib0 + rank : 7 (local_rank: 7) + exitcode : 1 (pid: 2796221) + error_file: /tmp/torchelastic_yacw5z7i/none_s905kneh/attempt_0/7/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron + set_global_variables(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables + _build_num_microbatches_calculator(args) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator + num_microbatches_calculator = ConstantNumMicroBatches( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron + set_global_variables(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables + _build_num_microbatches_calculator(args) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator + num_microbatches_calculator = ConstantNumMicroBatches( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ + assert global_batch_size % micro_batch_times_data_parallel == 0, \ + AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (48) + +============================================================ + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ + assert global_batch_size % micro_batch_times_data_parallel == 0, \ + AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (48) + +------------------------------------------------------------ +Root Cause (first observed failure): +[0]: + time : 2022-10-06_12:54:52 + host : jean-zay-iam10-ib0 + rank : 0 (local_rank: 0) + exitcode : 1 (pid: 2796214) + error_file: /tmp/torchelastic_yacw5z7i/none_s905kneh/attempt_0/0/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron + set_global_variables(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables + _build_num_microbatches_calculator(args) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator + num_microbatches_calculator = ConstantNumMicroBatches( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ + assert global_batch_size % micro_batch_times_data_parallel == 0, \ + AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (48) + +============================================================ + run(args) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 752, in run + elastic_launch( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 131, in __call__ + main() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 761, in main + main() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + run(args) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 752, in run + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 761, in main + return launch_agent(self._config, self._entrypoint, list(args)) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 245, in launch_agent + elastic_launch( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 131, in __call__ + raise ChildFailedError( +torch.distributed.elastic.multiprocessing.errors.ChildFailedError: +============================================================ +/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py FAILED +------------------------------------------------------------ +Failures: +[1]: + time : 2022-10-06_12:54:52 + host : jean-zay-iam28-ib0 + rank : 65 (local_rank: 1) + exitcode : 1 (pid: 3914693) + error_file: /tmp/torchelastic_zxo4lo24/none_qke9ewra/attempt_0/1/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron + set_global_variables(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables + _build_num_microbatches_calculator(args) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator + num_microbatches_calculator = ConstantNumMicroBatches( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ + assert global_batch_size % micro_batch_times_data_parallel == 0, \ + AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (48) + +[2]: + time : 2022-10-06_12:54:52 + host : jean-zay-iam28-ib0 + rank : 66 (local_rank: 2) + exitcode : 1 (pid: 3914694) + error_file: /tmp/torchelastic_zxo4lo24/none_qke9ewra/attempt_0/2/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron + set_global_variables(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables + _build_num_microbatches_calculator(args) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator + num_microbatches_calculator = ConstantNumMicroBatches( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ + assert global_batch_size % micro_batch_times_data_parallel == 0, \ + AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (48) + +[3]: + time : 2022-10-06_12:54:52 + host : jean-zay-iam28-ib0 + rank : 67 (local_rank: 3) + exitcode : 1 (pid: 3914695) + error_file: /tmp/torchelastic_zxo4lo24/none_qke9ewra/attempt_0/3/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron + set_global_variables(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables + _build_num_microbatches_calculator(args) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator + num_microbatches_calculator = ConstantNumMicroBatches( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ + assert global_batch_size % micro_batch_times_data_parallel == 0, \ + AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (48) + +[4]: + time : 2022-10-06_12:54:52 + host : jean-zay-iam28-ib0 + rank : 68 (local_rank: 4) + exitcode : 1 (pid: 3914696) + error_file: /tmp/torchelastic_zxo4lo24/none_qke9ewra/attempt_0/4/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron + set_global_variables(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables + _build_num_microbatches_calculator(args) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator + num_microbatches_calculator = ConstantNumMicroBatches( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ + assert global_batch_size % micro_batch_times_data_parallel == 0, \ + AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (48) + +[5]: + time : 2022-10-06_12:54:52 + host : jean-zay-iam28-ib0 + rank : 69 (local_rank: 5) + exitcode : 1 (pid: 3914697) + error_file: /tmp/torchelastic_zxo4lo24/none_qke9ewra/attempt_0/5/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron + set_global_variables(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables + _build_num_microbatches_calculator(args) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator + num_microbatches_calculator = ConstantNumMicroBatches( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ + assert global_batch_size % micro_batch_times_data_parallel == 0, \ + AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (48) + +[6]: + time : 2022-10-06_12:54:52 + host : jean-zay-iam28-ib0 + rank : 70 (local_rank: 6) + exitcode : 1 (pid: 3914698) + error_file: /tmp/torchelastic_zxo4lo24/none_qke9ewra/attempt_0/6/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron + set_global_variables(extra_args_provider=extra_args_provider, + run(args) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 752, in run + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables + _build_num_microbatches_calculator(args) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator + num_microbatches_calculator = ConstantNumMicroBatches( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ + assert global_batch_size % micro_batch_times_data_parallel == 0, \ + AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (48) + +[7]: + time : 2022-10-06_12:54:52 + host : jean-zay-iam28-ib0 + rank : 71 (local_rank: 7) + exitcode : 1 (pid: 3914699) + error_file: /tmp/torchelastic_zxo4lo24/none_qke9ewra/attempt_0/7/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron + set_global_variables(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables + _build_num_microbatches_calculator(args) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator + num_microbatches_calculator = ConstantNumMicroBatches( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ + assert global_batch_size % micro_batch_times_data_parallel == 0, \ + AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (48) + +------------------------------------------------------------ +Root Cause (first observed failure): +[0]: + time : 2022-10-06_12:54:52 + host : jean-zay-iam28-ib0 + rank : 64 (local_rank: 0) + exitcode : 1 (pid: 3914692) + error_file: /tmp/torchelastic_zxo4lo24/none_qke9ewra/attempt_0/0/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron + set_global_variables(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables + _build_num_microbatches_calculator(args) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator + num_microbatches_calculator = ConstantNumMicroBatches( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ + assert global_batch_size % micro_batch_times_data_parallel == 0, \ + AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (48) + +============================================================ + elastic_launch( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 131, in __call__ + return launch_agent(self._config, self._entrypoint, list(args)) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 245, in launch_agent + raise ChildFailedError( +torch.distributed.elastic.multiprocessing.errors.ChildFailedError: +============================================================ +/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py FAILED +------------------------------------------------------------ +Failures: +[1]: + time : 2022-10-06_12:54:52 + host : jean-zay-iam33-ib0 + rank : 89 (local_rank: 1) + exitcode : 1 (pid: 1066729) + error_file: /tmp/torchelastic_2b6hwefb/none_9rvwvd72/attempt_0/1/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron + set_global_variables(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables + _build_num_microbatches_calculator(args) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator + num_microbatches_calculator = ConstantNumMicroBatches( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ + assert global_batch_size % micro_batch_times_data_parallel == 0, \ + AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (48) + +[2]: + time : 2022-10-06_12:54:52 + host : jean-zay-iam33-ib0 + rank : 90 (local_rank: 2) + exitcode : 1 (pid: 1066730) + error_file: /tmp/torchelastic_2b6hwefb/none_9rvwvd72/attempt_0/2/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron + set_global_variables(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables + _build_num_microbatches_calculator(args) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator + num_microbatches_calculator = ConstantNumMicroBatches( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ + assert global_batch_size % micro_batch_times_data_parallel == 0, \ + AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (48) + +[3]: + time : 2022-10-06_12:54:52 + host : jean-zay-iam33-ib0 + rank : 91 (local_rank: 3) + exitcode : 1 (pid: 1066731) + error_file: /tmp/torchelastic_2b6hwefb/none_9rvwvd72/attempt_0/3/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron + set_global_variables(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables + _build_num_microbatches_calculator(args) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator + num_microbatches_calculator = ConstantNumMicroBatches( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ + assert global_batch_size % micro_batch_times_data_parallel == 0, \ + AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (48) + +[4]: + time : 2022-10-06_12:54:52 + host : jean-zay-iam33-ib0 + rank : 92 (local_rank: 4) + exitcode : 1 (pid: 1066732) + error_file: /tmp/torchelastic_2b6hwefb/none_9rvwvd72/attempt_0/4/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron + set_global_variables(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables + _build_num_microbatches_calculator(args) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator + num_microbatches_calculator = ConstantNumMicroBatches( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ + assert global_batch_size % micro_batch_times_data_parallel == 0, \ + AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (48) + +[5]: + time : 2022-10-06_12:54:52 + host : jean-zay-iam33-ib0 + rank : 93 (local_rank: 5) + exitcode : 1 (pid: 1066733) + error_file: /tmp/torchelastic_2b6hwefb/none_9rvwvd72/attempt_0/5/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron + set_global_variables(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables + _build_num_microbatches_calculator(args) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator + num_microbatches_calculator = ConstantNumMicroBatches( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ + assert global_batch_size % micro_batch_times_data_parallel == 0, \ + AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (48) + +[6]: + time : 2022-10-06_12:54:52 + host : jean-zay-iam33-ib0 + rank : 94 (local_rank: 6) + exitcode : 1 (pid: 1066734) + error_file: /tmp/torchelastic_2b6hwefb/none_9rvwvd72/attempt_0/6/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron + set_global_variables(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables + _build_num_microbatches_calculator(args) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator + num_microbatches_calculator = ConstantNumMicroBatches( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ + assert global_batch_size % micro_batch_times_data_parallel == 0, \ + AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (48) + +[7]: + time : 2022-10-06_12:54:52 + host : jean-zay-iam33-ib0 + rank : 95 (local_rank: 7) + exitcode : 1 (pid: 1066735) + error_file: /tmp/torchelastic_2b6hwefb/none_9rvwvd72/attempt_0/7/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron + set_global_variables(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables + _build_num_microbatches_calculator(args) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator + num_microbatches_calculator = ConstantNumMicroBatches( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ + assert global_batch_size % micro_batch_times_data_parallel == 0, \ + AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (48) + +------------------------------------------------------------ +Root Cause (first observed failure): +[0]: + time : 2022-10-06_12:54:52 + host : jean-zay-iam33-ib0 + rank : 88 (local_rank: 0) + exitcode : 1 (pid: 1066728) + error_file: /tmp/torchelastic_2b6hwefb/none_9rvwvd72/attempt_0/0/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron + set_global_variables(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables + _build_num_microbatches_calculator(args) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator + num_microbatches_calculator = ConstantNumMicroBatches( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ + assert global_batch_size % micro_batch_times_data_parallel == 0, \ + AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (48) + +============================================================ + return launch_agent(self._config, self._entrypoint, list(args)) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 245, in launch_agent + raise ChildFailedError( +torch.distributed.elastic.multiprocessing.errors.ChildFailedError: +============================================================ +/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py FAILED +------------------------------------------------------------ +Failures: +[1]: + time : 2022-10-06_12:54:52 + host : jean-zay-iam11-ib0 + rank : 9 (local_rank: 1) + exitcode : 1 (pid: 3525607) + error_file: /tmp/torchelastic_1r_khusy/none_nsqhdnr4/attempt_0/1/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron + set_global_variables(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables + _build_num_microbatches_calculator(args) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator + num_microbatches_calculator = ConstantNumMicroBatches( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ + assert global_batch_size % micro_batch_times_data_parallel == 0, \ + AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (48) + +[2]: + time : 2022-10-06_12:54:52 + host : jean-zay-iam11-ib0 + rank : 10 (local_rank: 2) + exitcode : 1 (pid: 3525608) + error_file: /tmp/torchelastic_1r_khusy/none_nsqhdnr4/attempt_0/2/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron + set_global_variables(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables + _build_num_microbatches_calculator(args) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator + num_microbatches_calculator = ConstantNumMicroBatches( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ + assert global_batch_size % micro_batch_times_data_parallel == 0, \ + AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (48) + +[3]: + time : 2022-10-06_12:54:52 + host : jean-zay-iam11-ib0 + rank : 11 (local_rank: 3) + exitcode : 1 (pid: 3525609) + error_file: /tmp/torchelastic_1r_khusy/none_nsqhdnr4/attempt_0/3/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron + set_global_variables(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables + _build_num_microbatches_calculator(args) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator + num_microbatches_calculator = ConstantNumMicroBatches( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ + assert global_batch_size % micro_batch_times_data_parallel == 0, \ + AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (48) + +[4]: + time : 2022-10-06_12:54:52 + host : jean-zay-iam11-ib0 + rank : 12 (local_rank: 4) + exitcode : 1 (pid: 3525610) + error_file: /tmp/torchelastic_1r_khusy/none_nsqhdnr4/attempt_0/4/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron + set_global_variables(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables + _build_num_microbatches_calculator(args) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator + num_microbatches_calculator = ConstantNumMicroBatches( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ + assert global_batch_size % micro_batch_times_data_parallel == 0, \ + AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (48) + +[5]: + time : 2022-10-06_12:54:52 + host : jean-zay-iam11-ib0 + rank : 13 (local_rank: 5) + exitcode : 1 (pid: 3525611) + error_file: /tmp/torchelastic_1r_khusy/none_nsqhdnr4/attempt_0/5/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron + set_global_variables(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables + _build_num_microbatches_calculator(args) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator + num_microbatches_calculator = ConstantNumMicroBatches( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ + assert global_batch_size % micro_batch_times_data_parallel == 0, \ + AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (48) + +[6]: + time : 2022-10-06_12:54:52 + host : jean-zay-iam11-ib0 + rank : 14 (local_rank: 6) + exitcode : 1 (pid: 3525612) + error_file: /tmp/torchelastic_1r_khusy/none_nsqhdnr4/attempt_0/6/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron + set_global_variables(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables + _build_num_microbatches_calculator(args) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator + num_microbatches_calculator = ConstantNumMicroBatches( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ + assert global_batch_size % micro_batch_times_data_parallel == 0, \ + AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (48) + +[7]: + time : 2022-10-06_12:54:52 + host : jean-zay-iam11-ib0 + rank : 15 (local_rank: 7) + exitcode : 1 (pid: 3525613) + error_file: /tmp/torchelastic_1r_khusy/none_nsqhdnr4/attempt_0/7/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron + set_global_variables(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables + _build_num_microbatches_calculator(args) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator + num_microbatches_calculator = ConstantNumMicroBatches( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ + assert global_batch_size % micro_batch_times_data_parallel == 0, \ + AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (48) + +------------------------------------------------------------ +Root Cause (first observed failure): +[0]: + time : 2022-10-06_12:54:52 + host : jean-zay-iam11-ib0 + rank : 8 (local_rank: 0) + exitcode : 1 (pid: 3525606) + error_file: /tmp/torchelastic_1r_khusy/none_nsqhdnr4/attempt_0/0/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron + set_global_variables(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables + _build_num_microbatches_calculator(args) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator + num_microbatches_calculator = ConstantNumMicroBatches( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ + assert global_batch_size % micro_batch_times_data_parallel == 0, \ + AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (48) + +============================================================ +srun: error: jean-zay-iam31: task 10: Exited with exit code 1 +srun: launch/slurm: _step_signal: Terminating StepId=2075397.0 +srun: error: jean-zay-iam29: task 9: Exited with exit code 1 +srun: error: jean-zay-iam27: task 7: Exited with exit code 1 +srun: error: jean-zay-iam10: task 0: Exited with exit code 1 +srun: error: jean-zay-iam25: task 6: Exited with exit code 1 +srun: error: jean-zay-iam33: task 11: Exited with exit code 1 +srun: error: jean-zay-iam11: task 1: Exited with exit code 1 +srun: error: jean-zay-iam28: task 8: Exited with exit code 1 +srun: error: jean-zay-iam24: task 5: Exited with exit code 1 +srun: error: jean-zay-iam22: task 3: Exited with exit code 1 +srun: error: jean-zay-iam12: task 2: Exited with exit code 1 +srun: error: jean-zay-iam23: task 4: Exited with exit code 1 +WARNING:__main__: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +WARNING:__main__: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +WARNING:__main__: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +WARNING:__main__: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +WARNING:__main__: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +WARNING:__main__: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +WARNING:__main__: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +WARNING:__main__: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +WARNING:__main__: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +WARNING:__main__: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +WARNING:__main__: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +WARNING:__main__: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +[default3]:Traceback (most recent call last): +[default3]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default3]: main() +[default4]:Traceback (most recent call last): +[default4]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default3]: return f(*args, **kwargs) +[default3]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default3]: pretrain( +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default4]: main() +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default4]: return f(*args, **kwargs) +[default3]: initialize_megatron(extra_args_provider=extra_args_provider, +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron +[default4]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default4]: pretrain( +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default4]: initialize_megatron(extra_args_provider=extra_args_provider, +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron +[default4]: set_global_variables(extra_args_provider=extra_args_provider, +[default3]: set_global_variables(extra_args_provider=extra_args_provider, +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables +[default3]: _build_num_microbatches_calculator(args) +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables +[default4]: _build_num_microbatches_calculator(args) +[default3]: _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator +[default3]: num_microbatches_calculator = ConstantNumMicroBatches( +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ +[default3]: assert global_batch_size % micro_batch_times_data_parallel == 0, \ +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator +[default3]:AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (24) +[default4]: _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator +[default4]: num_microbatches_calculator = ConstantNumMicroBatches( +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ +[default4]: assert global_batch_size % micro_batch_times_data_parallel == 0, \ +[default4]:AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (24) +[default5]:Traceback (most recent call last): +[default5]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default5]: main() +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default5]: return f(*args, **kwargs) +[default5]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default5]: pretrain( +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default5]: initialize_megatron(extra_args_provider=extra_args_provider, +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron +[default5]: set_global_variables(extra_args_provider=extra_args_provider, +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables +[default5]: _build_num_microbatches_calculator(args) +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator +[default5]: _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator +[default5]: num_microbatches_calculator = ConstantNumMicroBatches( +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ +[default5]: assert global_batch_size % micro_batch_times_data_parallel == 0, \ +[default5]:AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (24) +[default6]:Traceback (most recent call last): +[default6]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default6]: main() +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default6]: return f(*args, **kwargs) +[default6]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default6]: pretrain( +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default6]: initialize_megatron(extra_args_provider=extra_args_provider, +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron +[default6]: set_global_variables(extra_args_provider=extra_args_provider, +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables +[default6]: _build_num_microbatches_calculator(args) +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator +[default6]: _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator +[default6]: num_microbatches_calculator = ConstantNumMicroBatches( +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ +[default6]: assert global_batch_size % micro_batch_times_data_parallel == 0, \ +[default6]:AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (24) +[default7]:Traceback (most recent call last): +[default7]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default7]: main() +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default7]: return f(*args, **kwargs) +[default7]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default7]: pretrain( +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default7]: initialize_megatron(extra_args_provider=extra_args_provider, +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron +[default7]: set_global_variables(extra_args_provider=extra_args_provider, +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables +[default7]: _build_num_microbatches_calculator(args) +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator +[default7]: _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator +[default7]: num_microbatches_calculator = ConstantNumMicroBatches( +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ +[default7]: assert global_batch_size % micro_batch_times_data_parallel == 0, \ +[default7]:AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (24) +[default0]:Traceback (most recent call last): +[default0]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default0]: main() +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default0]: return f(*args, **kwargs) +[default0]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default0]: pretrain( +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default0]: initialize_megatron(extra_args_provider=extra_args_provider, +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron +[default0]: set_global_variables(extra_args_provider=extra_args_provider, +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables +[default0]: _build_num_microbatches_calculator(args) +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator +[default0]: _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator +[default0]: num_microbatches_calculator = ConstantNumMicroBatches( +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ +[default0]: assert global_batch_size % micro_batch_times_data_parallel == 0, \ +[default0]:AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (24) +[default1]:Traceback (most recent call last): +[default1]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default1]: main() +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default1]: return f(*args, **kwargs) +[default1]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default1]: pretrain( +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default1]: initialize_megatron(extra_args_provider=extra_args_provider, +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron +[default1]: set_global_variables(extra_args_provider=extra_args_provider, +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables +[default1]: _build_num_microbatches_calculator(args) +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator +[default1]: _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator +[default1]: num_microbatches_calculator = ConstantNumMicroBatches( +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ +[default1]: assert global_batch_size % micro_batch_times_data_parallel == 0, \ +[default1]:AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (24) +[default2]:Traceback (most recent call last): +[default2]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default2]: main() +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default2]: return f(*args, **kwargs) +[default2]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default2]: pretrain( +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default2]: initialize_megatron(extra_args_provider=extra_args_provider, +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron +[default2]: set_global_variables(extra_args_provider=extra_args_provider, +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables +[default2]: _build_num_microbatches_calculator(args) +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator +[default2]: _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator +[default2]: num_microbatches_calculator = ConstantNumMicroBatches( +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ +[default2]: assert global_batch_size % micro_batch_times_data_parallel == 0, \ +[default2]:AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (24) +[default1]:Traceback (most recent call last): +[default1]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default1]: main() +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default1]: return f(*args, **kwargs) +[default1]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default1]: pretrain( +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default1]: initialize_megatron(extra_args_provider=extra_args_provider, +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron +[default1]: set_global_variables(extra_args_provider=extra_args_provider, +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables +[default1]: _build_num_microbatches_calculator(args) +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator +[default1]: _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator +[default1]: num_microbatches_calculator = ConstantNumMicroBatches( +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ +[default1]: assert global_batch_size % micro_batch_times_data_parallel == 0, \ +[default1]:AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (24) +[default0]:Traceback (most recent call last): +[default4]:Traceback (most recent call last): +[default4]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default4]: main() +[default0]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default0]: main() +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default0]: return f(*args, **kwargs) +[default0]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default4]: return f(*args, **kwargs) +[default4]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default0]: pretrain( +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default0]: initialize_megatron(extra_args_provider=extra_args_provider, +[default4]: pretrain( +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default4]: initialize_megatron(extra_args_provider=extra_args_provider, +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron +[default4]: set_global_variables(extra_args_provider=extra_args_provider, +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables +[default4]: _build_num_microbatches_calculator(args) +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron +[default0]: set_global_variables(extra_args_provider=extra_args_provider, +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator +[default4]: _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables +[default0]: _build_num_microbatches_calculator(args) +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator +[default4]: num_microbatches_calculator = ConstantNumMicroBatches( +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ +[default0]: _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator +[default0]: num_microbatches_calculator = ConstantNumMicroBatches( +[default4]: assert global_batch_size % micro_batch_times_data_parallel == 0, \ +[default4]:AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (24) +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ +[default0]: assert global_batch_size % micro_batch_times_data_parallel == 0, \ +[default0]:AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (24) +[default2]:Traceback (most recent call last): +[default2]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default2]: main() +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default2]: return f(*args, **kwargs) +[default2]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default2]: pretrain( +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default2]: initialize_megatron(extra_args_provider=extra_args_provider, +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron +[default2]: set_global_variables(extra_args_provider=extra_args_provider, +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables +[default2]: _build_num_microbatches_calculator(args) +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator +[default2]: _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator +[default2]: num_microbatches_calculator = ConstantNumMicroBatches( +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ +[default2]: assert global_batch_size % micro_batch_times_data_parallel == 0, \ +[default2]:AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (24) +[default3]:Traceback (most recent call last): +[default3]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default3]: main() +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default1]:Traceback (most recent call last): +[default2]:Traceback (most recent call last): +[default3]: return f(*args, **kwargs) +[default2]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default2]: main() +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default2]: return f(*args, **kwargs) +[default2]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default4]:Traceback (most recent call last): +[default4]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default4]: main() +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default4]: return f(*args, **kwargs) +[default3]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default3]: pretrain( +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default3]: initialize_megatron(extra_args_provider=extra_args_provider, +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron +[default3]: set_global_variables(extra_args_provider=extra_args_provider, +[default2]: pretrain( +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default2]: initialize_megatron(extra_args_provider=extra_args_provider, +[default4]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables +[default3]: _build_num_microbatches_calculator(args) +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator +[default3]: _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator +[default3]: num_microbatches_calculator = ConstantNumMicroBatches( +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ +[default3]: assert global_batch_size % micro_batch_times_data_parallel == 0, \ +[default4]: pretrain( +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default3]:AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (24) +[default1]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default1]: main() +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default1]: return f(*args, **kwargs) +[default1]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron +[default1]: pretrain( +[default2]: set_global_variables(extra_args_provider=extra_args_provider, +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables +[default4]: initialize_megatron(extra_args_provider=extra_args_provider, +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron +[default4]: set_global_variables(extra_args_provider=extra_args_provider, +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default1]: initialize_megatron(extra_args_provider=extra_args_provider, +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron +[default1]: set_global_variables(extra_args_provider=extra_args_provider, +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables +[default1]: _build_num_microbatches_calculator(args) +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator +[default1]: _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator +[default2]: _build_num_microbatches_calculator(args) +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator +[default1]: num_microbatches_calculator = ConstantNumMicroBatches( +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ +[default2]: _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator +[default2]: num_microbatches_calculator = ConstantNumMicroBatches( +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ +[default3]:Traceback (most recent call last): +[default3]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default3]: main() +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default3]: return f(*args, **kwargs) +[default3]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default3]: pretrain( +[default1]: assert global_batch_size % micro_batch_times_data_parallel == 0, \ +[default1]:AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (24) +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default3]: initialize_megatron(extra_args_provider=extra_args_provider, +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables +[default4]: _build_num_microbatches_calculator(args) +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator +[default4]: _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +[default3]: set_global_variables(extra_args_provider=extra_args_provider, +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables +[default3]: _build_num_microbatches_calculator(args) +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator +[default4]: num_microbatches_calculator = ConstantNumMicroBatches( +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator +[default3]: _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ +[default4]: assert global_batch_size % micro_batch_times_data_parallel == 0, \ +[default4]:AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (24) +[default3]: num_microbatches_calculator = ConstantNumMicroBatches( +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ +[default3]: assert global_batch_size % micro_batch_times_data_parallel == 0, \ +[default2]: assert global_batch_size % micro_batch_times_data_parallel == 0, \ +[default2]:AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (24) +[default3]:AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (24) +[default3]:Traceback (most recent call last): +[default3]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default3]: main() +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default3]: return f(*args, **kwargs) +[default3]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default3]: pretrain( +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default3]: initialize_megatron(extra_args_provider=extra_args_provider, +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron +[default3]: set_global_variables(extra_args_provider=extra_args_provider, +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables +[default3]: _build_num_microbatches_calculator(args) +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator +[default3]: _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator +[default3]: num_microbatches_calculator = ConstantNumMicroBatches( +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ +[default3]: assert global_batch_size % micro_batch_times_data_parallel == 0, \ +[default3]:AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (24) +[default0]:Traceback (most recent call last): +[default0]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default0]: main() +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default0]: return f(*args, **kwargs) +[default0]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default0]: pretrain( +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default0]: initialize_megatron(extra_args_provider=extra_args_provider, +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron +[default0]: set_global_variables(extra_args_provider=extra_args_provider, +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables +[default0]: _build_num_microbatches_calculator(args) +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator +[default0]: _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator +[default0]: num_microbatches_calculator = ConstantNumMicroBatches( +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ +[default0]: assert global_batch_size % micro_batch_times_data_parallel == 0, \ +[default0]:AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (24) +[default4]:Traceback (most recent call last): +[default4]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default4]: main() +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default4]: return f(*args, **kwargs) +[default4]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default2]:Traceback (most recent call last): +[default2]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default2]: main() +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default4]: pretrain( +[default2]: return f(*args, **kwargs) +[default1]:Traceback (most recent call last): +[default1]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default1]: main() +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default1]: return f(*args, **kwargs) +[default1]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default4]: initialize_megatron(extra_args_provider=extra_args_provider, +[default1]: pretrain( +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron +[default1]: initialize_megatron(extra_args_provider=extra_args_provider, +[default2]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron +[default1]: set_global_variables(extra_args_provider=extra_args_provider, +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables +[default2]: pretrain( +[default4]: set_global_variables(extra_args_provider=extra_args_provider, +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables +[default4]: _build_num_microbatches_calculator(args) +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default2]: initialize_megatron(extra_args_provider=extra_args_provider, +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron +[default2]: set_global_variables(extra_args_provider=extra_args_provider, +[default1]: _build_num_microbatches_calculator(args) +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator +[default4]: _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables +[default2]: _build_num_microbatches_calculator(args) +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator +[default2]: _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator +[default2]: num_microbatches_calculator = ConstantNumMicroBatches( +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ +[default2]: assert global_batch_size % micro_batch_times_data_parallel == 0, \ +[default2]:AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (24) +[default1]: _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator +[default4]: num_microbatches_calculator = ConstantNumMicroBatches( +[default1]: num_microbatches_calculator = ConstantNumMicroBatches( +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ +[default1]: assert global_batch_size % micro_batch_times_data_parallel == 0, \ +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ +[default4]: assert global_batch_size % micro_batch_times_data_parallel == 0, \ +[default4]:AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (24) +[default1]:AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (24) +[default4]:Traceback (most recent call last): +[default4]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default0]:Traceback (most recent call last): +[default0]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default0]: main() +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default0]: return f(*args, **kwargs) +[default0]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default4]: main() +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default4]: return f(*args, **kwargs) +[default4]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default1]:Traceback (most recent call last): +[default1]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default1]: main() +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default4]: pretrain( +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default4]: initialize_megatron(extra_args_provider=extra_args_provider, +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron +[default0]: pretrain( +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default0]: initialize_megatron(extra_args_provider=extra_args_provider, +[default1]: return f(*args, **kwargs) +[default1]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default1]: pretrain( +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default4]: set_global_variables(extra_args_provider=extra_args_provider, +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables +[default4]: _build_num_microbatches_calculator(args) +[default1]: initialize_megatron(extra_args_provider=extra_args_provider, +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron +[default1]: set_global_variables(extra_args_provider=extra_args_provider, +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator +[default4]: _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron +[default0]: set_global_variables(extra_args_provider=extra_args_provider, +[default4]: num_microbatches_calculator = ConstantNumMicroBatches( +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables +[default0]: _build_num_microbatches_calculator(args) +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator +[default0]: _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ +[default1]: _build_num_microbatches_calculator(args) +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator +[default3]:Traceback (most recent call last): +[default3]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default1]: _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator +[default1]: num_microbatches_calculator = ConstantNumMicroBatches( +[default3]: main() +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default3]: return f(*args, **kwargs) +[default3]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default3]: pretrain( +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator +[default0]: num_microbatches_calculator = ConstantNumMicroBatches( +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ +[default0]: assert global_batch_size % micro_batch_times_data_parallel == 0, \ +[default0]:AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (24) +[default2]:Traceback (most recent call last): +[default4]: assert global_batch_size % micro_batch_times_data_parallel == 0, \ +[default2]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default2]: main() +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default3]: initialize_megatron(extra_args_provider=extra_args_provider, +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default2]: return f(*args, **kwargs) +[default2]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default2]: pretrain( +[default4]:AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (24) +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default1]: assert global_batch_size % micro_batch_times_data_parallel == 0, \ +[default1]:AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (24) +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron +[default3]: set_global_variables(extra_args_provider=extra_args_provider, +[default2]: initialize_megatron(extra_args_provider=extra_args_provider, +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron +[default2]: set_global_variables(extra_args_provider=extra_args_provider, +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables +[default2]: _build_num_microbatches_calculator(args) +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator +[default2]: _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator +[default2]: num_microbatches_calculator = ConstantNumMicroBatches( +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ +[default2]: assert global_batch_size % micro_batch_times_data_parallel == 0, \ +[default2]:AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (24) +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables +[default3]: _build_num_microbatches_calculator(args) +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator +[default3]: _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator +[default3]: num_microbatches_calculator = ConstantNumMicroBatches( +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ +[default3]: assert global_batch_size % micro_batch_times_data_parallel == 0, \ +[default3]:AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (24) +[default0]:Traceback (most recent call last): +[default0]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default0]: main() +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default0]: return f(*args, **kwargs) +[default0]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default0]: pretrain( +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default0]: initialize_megatron(extra_args_provider=extra_args_provider, +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron +[default0]: set_global_variables(extra_args_provider=extra_args_provider, +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables +[default0]: _build_num_microbatches_calculator(args) +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator +[default0]: _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator +[default0]: num_microbatches_calculator = ConstantNumMicroBatches( +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ +[default0]: assert global_batch_size % micro_batch_times_data_parallel == 0, \ +[default0]:AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (24) +[default7]:Traceback (most recent call last): +[default7]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default7]: main() +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default7]: return f(*args, **kwargs) +[default7]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default6]:Traceback (most recent call last): +[default7]: pretrain( +[default6]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default6]: main() +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default6]: return f(*args, **kwargs) +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default7]: initialize_megatron(extra_args_provider=extra_args_provider, +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron +[default5]:Traceback (most recent call last): +[default7]: set_global_variables(extra_args_provider=extra_args_provider, +[default5]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables +[default5]: main() +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default5]: return f(*args, **kwargs) +[default5]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default5]: pretrain( +[default6]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default6]: pretrain( +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default5]: initialize_megatron(extra_args_provider=extra_args_provider, +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron +[default7]: _build_num_microbatches_calculator(args) +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator +[default7]: _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +[default5]: set_global_variables(extra_args_provider=extra_args_provider, +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator +[default7]: num_microbatches_calculator = ConstantNumMicroBatches( +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ +[default6]: initialize_megatron(extra_args_provider=extra_args_provider, +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables +[default5]: _build_num_microbatches_calculator(args) +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator +[default6]: set_global_variables(extra_args_provider=extra_args_provider, +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables +[default6]: _build_num_microbatches_calculator(args) +[default5]: _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator +[default5]: num_microbatches_calculator = ConstantNumMicroBatches( +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ +[default5]: assert global_batch_size % micro_batch_times_data_parallel == 0, \ +[default7]: assert global_batch_size % micro_batch_times_data_parallel == 0, \ +[default7]:AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (24) +[default5]:AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (24) +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator +[default6]: _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator +[default6]: num_microbatches_calculator = ConstantNumMicroBatches( +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ +[default6]: assert global_batch_size % micro_batch_times_data_parallel == 0, \ +[default6]:AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (24) +[default5]:Traceback (most recent call last): +[default5]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default5]: main() +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default5]: return f(*args, **kwargs) +[default5]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default5]: pretrain( +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default5]: initialize_megatron(extra_args_provider=extra_args_provider, +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron +[default5]: set_global_variables(extra_args_provider=extra_args_provider, +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables +[default5]: _build_num_microbatches_calculator(args) +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator +[default5]: _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator +[default5]: num_microbatches_calculator = ConstantNumMicroBatches( +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ +[default5]: assert global_batch_size % micro_batch_times_data_parallel == 0, \ +[default5]:AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (24) +[default5]:Traceback (most recent call last): +[default5]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default5]: main() +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default5]: return f(*args, **kwargs) +[default5]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default5]: pretrain( +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default5]: initialize_megatron(extra_args_provider=extra_args_provider, +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron +[default5]: set_global_variables(extra_args_provider=extra_args_provider, +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables +[default5]: _build_num_microbatches_calculator(args) +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator +[default5]: _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator +[default5]: num_microbatches_calculator = ConstantNumMicroBatches( +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ +[default5]: assert global_batch_size % micro_batch_times_data_parallel == 0, \ +[default5]:AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (24) +[default6]:Traceback (most recent call last): +[default6]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default6]: main() +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default6]: return f(*args, **kwargs) +[default6]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default6]: pretrain( +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default6]: initialize_megatron(extra_args_provider=extra_args_provider, +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron +[default6]: set_global_variables(extra_args_provider=extra_args_provider, +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables +[default6]: _build_num_microbatches_calculator(args) +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator +[default6]: _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator +[default6]: num_microbatches_calculator = ConstantNumMicroBatches( +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ +[default6]: assert global_batch_size % micro_batch_times_data_parallel == 0, \ +[default6]:AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (24) +[default7]:Traceback (most recent call last): +[default7]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default7]: main() +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default7]: return f(*args, **kwargs) +[default7]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default7]: pretrain( +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default7]: initialize_megatron(extra_args_provider=extra_args_provider, +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron +[default7]: set_global_variables(extra_args_provider=extra_args_provider, +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables +[default7]: _build_num_microbatches_calculator(args) +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator +[default7]: _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator +[default7]: num_microbatches_calculator = ConstantNumMicroBatches( +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ +[default7]: assert global_batch_size % micro_batch_times_data_parallel == 0, \ +[default7]:AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (24) +[default7]:Traceback (most recent call last): +[default7]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default7]: main() +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default7]: return f(*args, **kwargs) +[default7]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default7]: pretrain( +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default7]: initialize_megatron(extra_args_provider=extra_args_provider, +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron +[default7]: set_global_variables(extra_args_provider=extra_args_provider, +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables +[default7]: _build_num_microbatches_calculator(args) +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator +[default7]: _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator +[default7]: num_microbatches_calculator = ConstantNumMicroBatches( +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ +[default7]: assert global_batch_size % micro_batch_times_data_parallel == 0, \ +[default7]:AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (24) +[default5]:Traceback (most recent call last): +[default5]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default5]: main() +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default5]: return f(*args, **kwargs) +[default5]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default5]: pretrain( +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default5]: initialize_megatron(extra_args_provider=extra_args_provider, +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron +[default5]: set_global_variables(extra_args_provider=extra_args_provider, +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables +[default5]: _build_num_microbatches_calculator(args) +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator +[default5]: _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator +[default5]: num_microbatches_calculator = ConstantNumMicroBatches( +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ +[default5]: assert global_batch_size % micro_batch_times_data_parallel == 0, \ +[default5]:AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (24) +[default6]:Traceback (most recent call last): +[default6]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default6]: main() +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default6]: return f(*args, **kwargs) +[default6]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default6]: pretrain( +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default6]: initialize_megatron(extra_args_provider=extra_args_provider, +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron +[default6]: set_global_variables(extra_args_provider=extra_args_provider, +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables +[default6]: _build_num_microbatches_calculator(args) +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator +[default6]: _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator +[default6]: num_microbatches_calculator = ConstantNumMicroBatches( +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ +[default6]: assert global_batch_size % micro_batch_times_data_parallel == 0, \ +[default6]:AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (24) +[default6]:Traceback (most recent call last): +[default6]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default6]: main() +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default6]: return f(*args, **kwargs) +[default6]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default6]: pretrain( +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default6]: initialize_megatron(extra_args_provider=extra_args_provider, +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron +[default6]: set_global_variables(extra_args_provider=extra_args_provider, +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables +[default6]: _build_num_microbatches_calculator(args) +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator +[default6]: _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator +[default6]: num_microbatches_calculator = ConstantNumMicroBatches( +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ +[default6]: assert global_batch_size % micro_batch_times_data_parallel == 0, \ +[default6]:AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (24) +[default7]:Traceback (most recent call last): +[default7]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default7]: main() +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default7]: return f(*args, **kwargs) +[default7]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default7]: pretrain( +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default7]: initialize_megatron(extra_args_provider=extra_args_provider, +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron +[default7]: set_global_variables(extra_args_provider=extra_args_provider, +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables +[default7]: _build_num_microbatches_calculator(args) +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator +[default7]: _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator +[default7]: num_microbatches_calculator = ConstantNumMicroBatches( +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ +[default7]: assert global_batch_size % micro_batch_times_data_parallel == 0, \ +[default7]:AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (24) +[default3]:Traceback (most recent call last): +[default0]:Traceback (most recent call last): +[default0]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default0]: main() +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default0]: return f(*args, **kwargs) +[default0]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default0]: pretrain( +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default0]: initialize_megatron(extra_args_provider=extra_args_provider, +[default2]:Traceback (most recent call last): +[default2]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default2]: main() +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default2]: return f(*args, **kwargs) +[default2]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default2]: pretrain( +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default2]: initialize_megatron(extra_args_provider=extra_args_provider, +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron +[default0]: set_global_variables(extra_args_provider=extra_args_provider, +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables +[default4]:Traceback (most recent call last): +[default4]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default4]: main() +[default3]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default4]: return f(*args, **kwargs) +[default4]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default4]: pretrain( +[default1]:Traceback (most recent call last): +[default1]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default1]: main() +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default0]: _build_num_microbatches_calculator(args) +[default3]: main() +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default3]: return f(*args, **kwargs) +[default2]: set_global_variables(extra_args_provider=extra_args_provider, +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables +[default2]: _build_num_microbatches_calculator(args) +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default4]: initialize_megatron(extra_args_provider=extra_args_provider, +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator +[default2]: _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +[default4]: set_global_variables(extra_args_provider=extra_args_provider, +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables +[default0]: _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator +[default2]: num_microbatches_calculator = ConstantNumMicroBatches( +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator +[default3]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default4]: _build_num_microbatches_calculator(args) +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator +[default4]: _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator +[default1]: return f(*args, **kwargs) +[default1]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default1]: pretrain( +[default0]: num_microbatches_calculator = ConstantNumMicroBatches( +[default3]: pretrain( +[default4]: num_microbatches_calculator = ConstantNumMicroBatches( +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default3]: initialize_megatron(extra_args_provider=extra_args_provider, +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default1]: initialize_megatron(extra_args_provider=extra_args_provider, +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron +[default0]: assert global_batch_size % micro_batch_times_data_parallel == 0, \ +[default0]:AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (24) +[default4]: assert global_batch_size % micro_batch_times_data_parallel == 0, \ +[default4]:AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (24) +[default1]: set_global_variables(extra_args_provider=extra_args_provider, +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables +[default1]: _build_num_microbatches_calculator(args) +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator +[default1]: _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +[default3]: set_global_variables(extra_args_provider=extra_args_provider, +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ +[default2]: assert global_batch_size % micro_batch_times_data_parallel == 0, \ +[default2]:AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (24) +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator +[default1]: num_microbatches_calculator = ConstantNumMicroBatches( +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ +[default3]: _build_num_microbatches_calculator(args) +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator +[default3]: _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +[default1]: assert global_batch_size % micro_batch_times_data_parallel == 0, \ +[default1]:AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (24) +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator +[default3]: num_microbatches_calculator = ConstantNumMicroBatches( +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ +[default3]: assert global_batch_size % micro_batch_times_data_parallel == 0, \ +[default3]:AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (24) +[default6]:Traceback (most recent call last): +[default5]:Traceback (most recent call last): +[default5]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default5]: main() +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default5]: return f(*args, **kwargs) +[default6]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default6]: main() +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default6]: return f(*args, **kwargs) +[default5]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default5]: pretrain( +[default6]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default6]: pretrain( +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default5]: initialize_megatron(extra_args_provider=extra_args_provider, +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default6]: initialize_megatron(extra_args_provider=extra_args_provider, +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron +[default5]: set_global_variables(extra_args_provider=extra_args_provider, +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables +[default5]: _build_num_microbatches_calculator(args) +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator +[default5]: _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +[default6]: set_global_variables(extra_args_provider=extra_args_provider, +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables +[default7]:Traceback (most recent call last): +[default7]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default7]: main() +[default6]: _build_num_microbatches_calculator(args) +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator +[default6]: _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator +[default5]: num_microbatches_calculator = ConstantNumMicroBatches( +[default6]: num_microbatches_calculator = ConstantNumMicroBatches( +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default7]: return f(*args, **kwargs) +[default7]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default6]: assert global_batch_size % micro_batch_times_data_parallel == 0, \ +[default6]:AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (24) +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ +[default5]: assert global_batch_size % micro_batch_times_data_parallel == 0, \ +[default7]: pretrain( +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default7]: initialize_megatron(extra_args_provider=extra_args_provider, +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron +[default7]: set_global_variables(extra_args_provider=extra_args_provider, +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables +[default7]: _build_num_microbatches_calculator(args) +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator +[default7]: _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator +[default5]:AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (24) +[default7]: num_microbatches_calculator = ConstantNumMicroBatches( +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ +[default7]: assert global_batch_size % micro_batch_times_data_parallel == 0, \ +[default7]:AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (24) +[default7]:Traceback (most recent call last): +[default7]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default7]: main() +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default7]: return f(*args, **kwargs) +[default7]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default7]: pretrain( +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default7]: initialize_megatron(extra_args_provider=extra_args_provider, +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron +[default7]: set_global_variables(extra_args_provider=extra_args_provider, +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables +[default7]: _build_num_microbatches_calculator(args) +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator +[default7]: _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator +[default7]: num_microbatches_calculator = ConstantNumMicroBatches( +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ +[default7]: assert global_batch_size % micro_batch_times_data_parallel == 0, \ +[default7]:AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (24) +[default6]:Traceback (most recent call last): +[default6]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default6]: main() +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default6]: return f(*args, **kwargs) +[default6]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default6]: pretrain( +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default6]: initialize_megatron(extra_args_provider=extra_args_provider, +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron +[default6]: set_global_variables(extra_args_provider=extra_args_provider, +[default5]:Traceback (most recent call last): +[default5]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default5]: main() +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables +[default6]: _build_num_microbatches_calculator(args) +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator +[default6]: _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator +[default6]: num_microbatches_calculator = ConstantNumMicroBatches( +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default5]: return f(*args, **kwargs) +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ +[default6]: assert global_batch_size % micro_batch_times_data_parallel == 0, \ +[default6]:AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (24) +[default5]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default5]: pretrain( +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default5]: initialize_megatron(extra_args_provider=extra_args_provider, +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron +[default5]: set_global_variables(extra_args_provider=extra_args_provider, +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables +[default5]: _build_num_microbatches_calculator(args) +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator +[default5]: _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator +[default5]: num_microbatches_calculator = ConstantNumMicroBatches( +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ +[default5]: assert global_batch_size % micro_batch_times_data_parallel == 0, \ +[default5]:AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (24) +[default0]:Traceback (most recent call last): +[default2]:Traceback (most recent call last): +[default0]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default2]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default3]:Traceback (most recent call last): +[default3]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default4]:Traceback (most recent call last): +[default4]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default4]: main() +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default2]:Traceback (most recent call last): +[default2]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default2]: main() +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default0]:Traceback (most recent call last): +[default0]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default0]: main() +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default0]: return f(*args, **kwargs) +[default0]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default2]: return f(*args, **kwargs) +[default2]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default0]: pretrain( +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default2]: pretrain( +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default0]: initialize_megatron(extra_args_provider=extra_args_provider, +[default2]: initialize_megatron(extra_args_provider=extra_args_provider, +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron +[default0]: set_global_variables(extra_args_provider=extra_args_provider, +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables +[default2]: set_global_variables(extra_args_provider=extra_args_provider, +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables +[default2]: _build_num_microbatches_calculator(args) +[default0]: _build_num_microbatches_calculator(args) +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator +[default0]: _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator +[default2]: _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator +[default0]: num_microbatches_calculator = ConstantNumMicroBatches( +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator +[default2]: num_microbatches_calculator = ConstantNumMicroBatches( +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ +[default0]: assert global_batch_size % micro_batch_times_data_parallel == 0, \ +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ +[default2]: assert global_batch_size % micro_batch_times_data_parallel == 0, \ +[default2]:AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (24) +[default0]:AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (24) +[default3]:Traceback (most recent call last): +[default3]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default3]: main() +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default3]: return f(*args, **kwargs) +[default3]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default3]: pretrain( +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default3]: initialize_megatron(extra_args_provider=extra_args_provider, +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron +[default3]: set_global_variables(extra_args_provider=extra_args_provider, +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables +[default3]: _build_num_microbatches_calculator(args) +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator +[default3]: _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator +[default3]: num_microbatches_calculator = ConstantNumMicroBatches( +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ +[default3]: assert global_batch_size % micro_batch_times_data_parallel == 0, \ +[default3]:AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (24) +[default1]:Traceback (most recent call last): +[default1]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default1]: main() +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default1]: return f(*args, **kwargs) +[default1]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default1]: pretrain( +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default1]: initialize_megatron(extra_args_provider=extra_args_provider, +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron +[default1]: set_global_variables(extra_args_provider=extra_args_provider, +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables +[default1]: _build_num_microbatches_calculator(args) +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator +[default1]: _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator +[default1]: num_microbatches_calculator = ConstantNumMicroBatches( +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ +[default1]: assert global_batch_size % micro_batch_times_data_parallel == 0, \ +[default1]:AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (24) +[default4]:Traceback (most recent call last): +[default4]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default4]: main() +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default4]: return f(*args, **kwargs) +[default4]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default4]: pretrain( +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default4]: initialize_megatron(extra_args_provider=extra_args_provider, +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron +[default4]: set_global_variables(extra_args_provider=extra_args_provider, +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables +[default4]: _build_num_microbatches_calculator(args) +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator +[default4]: _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator +[default4]: num_microbatches_calculator = ConstantNumMicroBatches( +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ +[default4]: assert global_batch_size % micro_batch_times_data_parallel == 0, \ +[default4]:AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (24) +[default3]:Traceback (most recent call last): +[default2]:Traceback (most recent call last): +[default2]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default2]: main() +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default2]: return f(*args, **kwargs) +[default2]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default0]:Traceback (most recent call last): +[default0]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default0]: main() +[default2]: pretrain( +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default0]: return f(*args, **kwargs) +[default0]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default0]: pretrain( +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default0]: initialize_megatron(extra_args_provider=extra_args_provider, +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron +[default1]:Traceback (most recent call last): +[default1]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default1]: main() +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default2]: initialize_megatron(extra_args_provider=extra_args_provider, +[default3]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default3]: main() +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default3]: return f(*args, **kwargs) +[default3]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default0]: set_global_variables(extra_args_provider=extra_args_provider, +[default3]: pretrain( +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default0]: _build_num_microbatches_calculator(args) +[default3]: initialize_megatron(extra_args_provider=extra_args_provider, +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator +[default3]: set_global_variables(extra_args_provider=extra_args_provider, +[default0]: _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron +[default2]: set_global_variables(extra_args_provider=extra_args_provider, +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default1]: return f(*args, **kwargs) +[default1]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default1]: pretrain( +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default3]: _build_num_microbatches_calculator(args) +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator +[default3]: _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator +[default1]: initialize_megatron(extra_args_provider=extra_args_provider, +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron +[default1]: set_global_variables(extra_args_provider=extra_args_provider, +[default3]: num_microbatches_calculator = ConstantNumMicroBatches( +[default0]: num_microbatches_calculator = ConstantNumMicroBatches( +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ +[default0]: assert global_batch_size % micro_batch_times_data_parallel == 0, \ +[default0]:AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (24) +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables +[default3]: assert global_batch_size % micro_batch_times_data_parallel == 0, \ +[default3]:AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (24) +[default1]: _build_num_microbatches_calculator(args) +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator +[default1]: _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator +[default1]: num_microbatches_calculator = ConstantNumMicroBatches( +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ +[default1]: assert global_batch_size % micro_batch_times_data_parallel == 0, \ +[default1]:AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (24) +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables +[default2]: _build_num_microbatches_calculator(args) +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator +[default2]: _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator +[default2]: num_microbatches_calculator = ConstantNumMicroBatches( +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ +[default2]: assert global_batch_size % micro_batch_times_data_parallel == 0, \ +[default2]:AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (24) +[default7]:Traceback (most recent call last): +[default7]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default7]: main() +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default7]: return f(*args, **kwargs) +[default7]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default7]: pretrain( +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default7]: initialize_megatron(extra_args_provider=extra_args_provider, +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron +[default7]: set_global_variables(extra_args_provider=extra_args_provider, +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables +[default7]: _build_num_microbatches_calculator(args) +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator +[default7]: _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator +[default7]: num_microbatches_calculator = ConstantNumMicroBatches( +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ +[default7]: assert global_batch_size % micro_batch_times_data_parallel == 0, \ +[default7]:AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (24) +[default5]:Traceback (most recent call last): +[default6]:Traceback (most recent call last): +[default6]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default5]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default5]: main() +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default6]: main() +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default5]: return f(*args, **kwargs) +[default5]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default6]: return f(*args, **kwargs) +[default6]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default5]: pretrain( +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default5]: initialize_megatron(extra_args_provider=extra_args_provider, +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron +[default6]: pretrain( +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default6]: initialize_megatron(extra_args_provider=extra_args_provider, +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron +[default6]: set_global_variables(extra_args_provider=extra_args_provider, +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables +[default5]: set_global_variables(extra_args_provider=extra_args_provider, +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables +[default5]: _build_num_microbatches_calculator(args) +[default6]: _build_num_microbatches_calculator(args) +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator +[default5]: _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +[default6]: _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator +[default6]: num_microbatches_calculator = ConstantNumMicroBatches( +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator +[default5]: num_microbatches_calculator = ConstantNumMicroBatches( +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ +[default5]: assert global_batch_size % micro_batch_times_data_parallel == 0, \ +[default5]:AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (24) +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ +[default6]: assert global_batch_size % micro_batch_times_data_parallel == 0, \ +[default6]:AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (24) +[default0]:Traceback (most recent call last): +[default0]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default0]: main() +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default0]: return f(*args, **kwargs) +[default0]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default0]: pretrain( +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default0]: initialize_megatron(extra_args_provider=extra_args_provider, +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron +[default0]: set_global_variables(extra_args_provider=extra_args_provider, +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables +[default0]: _build_num_microbatches_calculator(args) +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator +[default0]: _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator +[default0]: num_microbatches_calculator = ConstantNumMicroBatches( +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ +[default0]: assert global_batch_size % micro_batch_times_data_parallel == 0, \ +[default0]:AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (24) +[default1]:Traceback (most recent call last): +[default1]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default1]: main() +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default1]: return f(*args, **kwargs) +[default1]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default1]: pretrain( +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default1]: initialize_megatron(extra_args_provider=extra_args_provider, +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron +[default1]: set_global_variables(extra_args_provider=extra_args_provider, +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables +[default1]: _build_num_microbatches_calculator(args) +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator +[default1]: _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator +[default1]: num_microbatches_calculator = ConstantNumMicroBatches( +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ +[default1]: assert global_batch_size % micro_batch_times_data_parallel == 0, \ +[default1]:AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (24) +[default3]:Traceback (most recent call last): +[default2]:Traceback (most recent call last): +[default2]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default2]: main() +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default2]: return f(*args, **kwargs) +[default2]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default3]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default3]: main() +[default2]: pretrain( +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default3]: return f(*args, **kwargs) +[default3]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default2]: initialize_megatron(extra_args_provider=extra_args_provider, +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron +[default2]: set_global_variables(extra_args_provider=extra_args_provider, +[default3]: pretrain( +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables +[default2]: _build_num_microbatches_calculator(args) +[default3]: initialize_megatron(extra_args_provider=extra_args_provider, +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron +[default3]: set_global_variables(extra_args_provider=extra_args_provider, +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator +[default2]: _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +[default3]: _build_num_microbatches_calculator(args) +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator +[default3]: _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator +[default2]: num_microbatches_calculator = ConstantNumMicroBatches( +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator +[default3]: num_microbatches_calculator = ConstantNumMicroBatches( +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ +[default2]: assert global_batch_size % micro_batch_times_data_parallel == 0, \ +[default2]:AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (24) +[default3]: assert global_batch_size % micro_batch_times_data_parallel == 0, \ +[default3]:AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (24) +[default4]:Traceback (most recent call last): +[default4]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default4]: main() +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default4]: return f(*args, **kwargs) +[default4]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default4]: pretrain( +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default4]: initialize_megatron(extra_args_provider=extra_args_provider, +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron +[default4]: set_global_variables(extra_args_provider=extra_args_provider, +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables +[default4]: _build_num_microbatches_calculator(args) +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator +[default4]: _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator +[default4]: num_microbatches_calculator = ConstantNumMicroBatches( +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ +[default4]: assert global_batch_size % micro_batch_times_data_parallel == 0, \ +[default4]:AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (24) +[default5]:Traceback (most recent call last): +[default5]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default5]: main() +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default5]: return f(*args, **kwargs) +[default6]:Traceback (most recent call last): +[default6]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default6]: main() +[default5]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default5]: pretrain( +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default6]: return f(*args, **kwargs) +[default6]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default6]: pretrain( +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default5]: initialize_megatron(extra_args_provider=extra_args_provider, +[default6]: initialize_megatron(extra_args_provider=extra_args_provider, +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron +[default6]: set_global_variables(extra_args_provider=extra_args_provider, +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron +[default5]: set_global_variables(extra_args_provider=extra_args_provider, +[default6]: _build_num_microbatches_calculator(args) +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables +[default5]: _build_num_microbatches_calculator(args) +[default6]: _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator +[default6]: num_microbatches_calculator = ConstantNumMicroBatches( +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator +[default5]: _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +[default6]: assert global_batch_size % micro_batch_times_data_parallel == 0, \ +[default6]:AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (24) +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator +[default5]: num_microbatches_calculator = ConstantNumMicroBatches( +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ +[default5]: assert global_batch_size % micro_batch_times_data_parallel == 0, \ +[default5]:AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (24) +[default6]:Traceback (most recent call last): +[default6]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default6]: main() +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default6]: return f(*args, **kwargs) +[default6]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default6]: pretrain( +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default6]: initialize_megatron(extra_args_provider=extra_args_provider, +[default5]:Traceback (most recent call last): +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron +[default6]: set_global_variables(extra_args_provider=extra_args_provider, +[default5]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default5]: main() +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default5]: return f(*args, **kwargs) +[default5]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default5]: pretrain( +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables +[default6]: _build_num_microbatches_calculator(args) +[default7]:Traceback (most recent call last): +[default7]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default7]: main() +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default5]: initialize_megatron(extra_args_provider=extra_args_provider, +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron +[default5]: set_global_variables(extra_args_provider=extra_args_provider, +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default7]: return f(*args, **kwargs) +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator +[default6]: _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator +[default5]: _build_num_microbatches_calculator(args) +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator +[default5]: _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator +[default5]: num_microbatches_calculator = ConstantNumMicroBatches( +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ +[default7]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default7]: pretrain( +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default7]: initialize_megatron(extra_args_provider=extra_args_provider, +[default5]: assert global_batch_size % micro_batch_times_data_parallel == 0, \ +[default5]:AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (24) +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron +[default6]: num_microbatches_calculator = ConstantNumMicroBatches( +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ +[default7]: set_global_variables(extra_args_provider=extra_args_provider, +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables +[default6]: assert global_batch_size % micro_batch_times_data_parallel == 0, \ +[default6]:AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (24) +[default7]: _build_num_microbatches_calculator(args) +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator +[default7]: _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator +[default7]: num_microbatches_calculator = ConstantNumMicroBatches( +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ +[default7]: assert global_batch_size % micro_batch_times_data_parallel == 0, \ +[default7]:AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (24) +[default1]:Traceback (most recent call last): +[default1]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default1]: main() +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default1]: return f(*args, **kwargs) +[default1]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default1]: pretrain( +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default1]: initialize_megatron(extra_args_provider=extra_args_provider, +[default0]: main() +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default0]: return f(*args, **kwargs) +[default0]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default0]: pretrain( +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron +[default1]: set_global_variables(extra_args_provider=extra_args_provider, +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables +[default1]: _build_num_microbatches_calculator(args) +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator +[default1]: _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator +[default1]: num_microbatches_calculator = ConstantNumMicroBatches( +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ +[default1]: assert global_batch_size % micro_batch_times_data_parallel == 0, \ +[default1]:AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (24) +[default2]: main() +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default2]: return f(*args, **kwargs) +[default2]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default2]: pretrain( +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default2]: initialize_megatron(extra_args_provider=extra_args_provider, +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron +[default2]: set_global_variables(extra_args_provider=extra_args_provider, +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables +[default2]: _build_num_microbatches_calculator(args) +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator +[default2]: _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator +[default2]: num_microbatches_calculator = ConstantNumMicroBatches( +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ +[default2]: assert global_batch_size % micro_batch_times_data_parallel == 0, \ +[default2]:AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (24) +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default0]: initialize_megatron(extra_args_provider=extra_args_provider, +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron +[default0]: set_global_variables(extra_args_provider=extra_args_provider, +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables +[default0]: _build_num_microbatches_calculator(args) +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator +[default0]: _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator +[default0]: num_microbatches_calculator = ConstantNumMicroBatches( +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ +[default0]: assert global_batch_size % micro_batch_times_data_parallel == 0, \ +[default0]:AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (24) +[default3]: main() +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default3]: return f(*args, **kwargs) +[default3]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default3]: pretrain( +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default3]: initialize_megatron(extra_args_provider=extra_args_provider, +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron +[default3]: set_global_variables(extra_args_provider=extra_args_provider, +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables +[default3]: _build_num_microbatches_calculator(args) +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator +[default3]: _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator +[default3]: num_microbatches_calculator = ConstantNumMicroBatches( +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ +[default3]: assert global_batch_size % micro_batch_times_data_parallel == 0, \ +[default3]:AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (24) +[default4]: return f(*args, **kwargs) +[default4]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default4]: pretrain( +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default4]: initialize_megatron(extra_args_provider=extra_args_provider, +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron +[default4]: set_global_variables(extra_args_provider=extra_args_provider, +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables +[default4]: _build_num_microbatches_calculator(args) +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator +[default4]: _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator +[default4]: num_microbatches_calculator = ConstantNumMicroBatches( +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ +[default4]: assert global_batch_size % micro_batch_times_data_parallel == 0, \ +[default4]:AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (24) +[default4]:Traceback (most recent call last): +[default4]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default4]: main() +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default4]: return f(*args, **kwargs) +[default4]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default4]: pretrain( +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default4]: initialize_megatron(extra_args_provider=extra_args_provider, +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron +[default4]: set_global_variables(extra_args_provider=extra_args_provider, +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables +[default4]: _build_num_microbatches_calculator(args) +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator +[default4]: _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator +[default4]: num_microbatches_calculator = ConstantNumMicroBatches( +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ +[default4]: assert global_batch_size % micro_batch_times_data_parallel == 0, \ +[default4]:AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (24) +[default7]:Traceback (most recent call last): +[default7]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default7]: main() +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default7]: return f(*args, **kwargs) +[default7]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default7]: pretrain( +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default7]: initialize_megatron(extra_args_provider=extra_args_provider, +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron +[default7]: set_global_variables(extra_args_provider=extra_args_provider, +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables +[default7]: _build_num_microbatches_calculator(args) +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator +[default7]: _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator +[default7]: num_microbatches_calculator = ConstantNumMicroBatches( +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ +[default7]: assert global_batch_size % micro_batch_times_data_parallel == 0, \ +[default7]:AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (24) +[default0]:using world size: 96, data-parallel-size: 24, tensor-model-parallel size: 2, pipeline-model-parallel size: 2 +[default0]:using torch.float16 for parameters ... +[default0]:------------------------ arguments ------------------------ +[default0]: abort_on_unmet_fused_kernel_constraints ......... True +[default0]: accumulate_allreduce_grads_in_fp32 .............. False +[default0]: adam_beta1 ...................................... 0.9 +[default0]: adam_beta2 ...................................... 0.95 +[default0]: adam_eps ........................................ 1e-08 +[default0]: adlr_autoresume ................................. False +[default0]: adlr_autoresume_interval ........................ 1000 +[default0]: apply_query_key_layer_scaling ................... True +[default0]: apply_residual_connection_post_layernorm ........ False +[default0]: attention_dropout ............................... 0.1 +[default0]: attention_softmax_in_fp32 ....................... False +[default0]: bert_binary_head ................................ True +[default0]: bert_load ....................................... None +[default0]: bf16 ............................................ False +[default0]: bias_dropout_fusion ............................. True +[default0]: bias_gelu_fusion ................................ True +[default0]: biencoder_projection_dim ........................ 0 +[default0]: biencoder_shared_query_context_model ............ False +[default0]: block_data_path ................................. None +[default0]: checkpoint_activations .......................... True +[default0]: checkpoint_in_cpu ............................... False +[default0]: checkpoint_num_layers ........................... 1 +[default0]: clip_grad ....................................... 1.0 +[default0]: codecarbon_dir .................................. None +[default0]: consumed_train_samples .......................... 0 +[default0]: consumed_train_tokens ........................... 0 +[default0]: consumed_valid_samples .......................... 0 +[default0]: contigious_checkpointing ........................ False +[default0]: cpu_optimizer ................................... False +[default0]: cpu_torch_adam .................................. False +[default0]: curriculum_learning ............................. False +[default0]: data_impl ....................................... mmap +[default0]: data_parallel_size .............................. 24 +[default0]: data_path ....................................... None +[default0]: dataloader_type ................................. single +[default0]: DDP_impl ........................................ local +[default0]: decoder_seq_length .............................. None +[default0]: deepscale ....................................... False +[default0]: deepscale_config ................................ None +[default0]: deepspeed ....................................... True +[default0]: deepspeed_activation_checkpointing .............. True +[default0]: deepspeed_config ................................ ./ds_config.2075406.json +[default0]: deepspeed_mpi ................................... False +[default0]: distribute_checkpointed_activations ............. False +[default0]: distributed_backend ............................. nccl +[default0]: embed_layernorm ................................. True +[default0]: embedding_path .................................. None +[default0]: encoder_seq_length .............................. 2048 +[default0]: eod_mask_loss ................................... False +[default0]: eval_interval ................................... 125 +[default0]: eval_iters ...................................... 10 +[default0]: eval_only ....................................... None +[default0]: evidence_data_path .............................. None +[default0]: exit_duration_in_mins ........................... 5990 +[default0]: exit_interval ................................... None +[default0]: ffn_hidden_size ................................. 6144 +[default0]: finetune ........................................ False +[default0]: fp16 ............................................ True +[default0]: fp16_lm_cross_entropy ........................... False +[default0]: fp32_residual_connection ........................ False +[default0]: gigaflos_no_embeds .............................. 0 +[default0]: global_batch_size ............................... 2048 +[default0]: glu_activation .................................. None +[default0]: hidden_dropout .................................. 0.1 +[default0]: hidden_size ..................................... 1536 +[default0]: hysteresis ...................................... 2 +[default0]: ict_head_size ................................... None +[default0]: ict_load ........................................ None +[default0]: img_dim ......................................... 224 +[default0]: indexer_batch_size .............................. 128 +[default0]: indexer_log_interval ............................ 1000 +[default0]: inference ....................................... False +[default0]: init_method_std ................................. 0.0048 +[default0]: init_method_xavier_uniform ...................... False +[default0]: initial_loss_scale .............................. 4294967296 +[default0]: kill_switch_path ................................ /gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/kill-switch-tr13b-1B3-mtf +[default0]: kv_channels ..................................... 96 +[default0]: layernorm_epsilon ............................... 1e-05 +[default0]: lazy_mpu_init ................................... None +[default0]: load ............................................ /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq +[default0]: local_rank ...................................... None +[default0]: log_batch_size_to_tensorboard ................... True +[default0]: log_interval .................................... 1 +[default0]: log_learning_rate_to_tensorboard ................ True +[default0]: log_level ....................................... None +[default0]: log_level_replica ............................... None +[default0]: log_loss_scale_to_tensorboard ................... True +[default0]: log_num_zeros_in_grad ........................... False +[default0]: log_params_norm ................................. False +[default0]: log_path ........................................ None +[default0]: log_timers_to_tensorboard ....................... True +[default0]: log_validation_ppl_to_tensorboard ............... True +[default0]: loss_on_targets_only ............................ False +[default0]: loss_scale ...................................... None +[default0]: loss_scale_window ............................... 1000 +[default0]: lr .............................................. 2e-05 +[default0]: lr_decay_iters .................................. None +[default0]: lr_decay_samples ................................ None +[default0]: lr_decay_style .................................. constant +[default0]: lr_decay_tokens ................................. None +[default0]: lr_warmup_fraction .............................. None +[default0]: lr_warmup_iters ................................. 0 +[default0]: lr_warmup_samples ............................... 0 +[default0]: make_vocab_size_divisible_by .................... 128 +[default0]: mask_prob ....................................... 0.15 +[default0]: masked_softmax_fusion ........................... True +[default0]: max_position_embeddings ......................... 2048 +[default0]: mean_noise_span_length .......................... None +[default0]: memory_centric_tiled_linear ..................... False +[default0]: merge_file ...................................... None +[default0]: micro_batch_size ................................ 1 +[default0]: min_loss_scale .................................. 1.0 +[default0]: min_lr .......................................... 0.0 +[default0]: mmap_warmup ..................................... False +[default0]: no_load_optim ................................... True +[default0]: no_load_rng ..................................... None +[default0]: no_save_optim ................................... None +[default0]: no_save_rng ..................................... None +[default0]: noise_density ................................... None +[default0]: norm_target_loss ................................ True +[default0]: num_attention_heads ............................. 16 +[default0]: num_channels .................................... 3 +[default0]: num_classes ..................................... 1000 +[default0]: num_layers ...................................... 24 +[default0]: num_layers_per_virtual_pipeline_stage ........... None +[default0]: num_workers ..................................... 2 +[default0]: onnx_safe ....................................... None +[default0]: openai_gelu ..................................... False +[default0]: optimizer ....................................... adam +[default0]: override_lr_scheduler ........................... False +[default0]: pad_vocab_size_to ............................... 250880 +[default0]: params_dtype .................................... torch.float16 +[default0]: partition_activations ........................... False +[default0]: patch_dim ....................................... 16 +[default0]: pipeline_model_parallel_size .................... 2 +[default0]: position_embedding_type ......................... PositionEmbeddingType.alibi +[default0]: pp_partition_method ............................. type:transformer|embedding +[default0]: prefixlm ........................................ False +[default0]: profile_backward ................................ False +[default0]: query_in_block_prob ............................. 0.1 +[default0]: rampup_batch_size ............................... None +[default0]: rank ............................................ 0 +[default0]: remote_device ................................... none +[default0]: reset_attention_mask ............................ False +[default0]: reset_position_ids .............................. False +[default0]: reset_progress .................................. True +[default0]: retriever_report_topk_accuracies ................ [] +[default0]: retriever_score_scaling ......................... False +[default0]: retriever_seq_length ............................ 256 +[default0]: reweight_loss_based_on_position_frequency ....... False +[default0]: sample_rate ..................................... 1.0 +[default0]: save ............................................ /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq +[default0]: save_interval ................................... 2 +[default0]: scatter_gather_tensors_in_pipeline .............. True +[default0]: scattered_embeddings ............................ False +[default0]: seed ............................................ 42 +[default0]: seq_length ...................................... 2048 +[default0]: sgd_momentum .................................... 0.9 +[default0]: short_seq_prob .................................. 0.1 +[default0]: skip_train_iteration_range ...................... None +[default0]: split ........................................... None +[default0]: split_transformers .............................. False +[default0]: sync_tp_duplicated_parameters ................... False +[default0]: synchronize_each_layer .......................... False +[default0]: tensor_model_parallel_size ...................... 2 +[default0]: tensorboard_dir ................................. /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/tr13b-1B3-ml-logs/tensorboard/xp3capmixnewcodelonglossseq +[default0]: tensorboard_log_interval ........................ 1 +[default0]: tensorboard_queue_size .......................... 5 +[default0]: test_weighted_split_paths ....................... None +[default0]: test_weighted_split_paths_path .................. None +[default0]: tile_factor ..................................... 1 +[default0]: titles_data_path ................................ None +[default0]: tokenizer_name_or_path .......................... bigscience/tokenizer +[default0]: tokenizer_type .................................. PretrainedFromHF +[default0]: train_iters ..................................... None +[default0]: train_samples ................................... 6348800 +[default0]: train_tokens .................................... None +[default0]: train_weighted_split_names ...................... ['train'] +[default0]: train_weighted_split_paths ...................... [['/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_en', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_es', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_pt', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_code', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_fr', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ar', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_id', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_zh', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_hi', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_vi', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ur', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_te', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ta', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_bn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_mr', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_sw', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_gu', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_pa', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ne', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_yo', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ig', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ny', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_zu', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_xh', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_sn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ts', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_rw', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_lg', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_tn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_nso', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_rn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ml', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_kn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_or', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_as', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ln', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_wo', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_tum', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ki', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_st', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_fon', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ca', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_eu', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ak', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_bm', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_tw']] +[default0]: train_weighted_split_paths_path ................. None +[default0]: train_weighted_split_splits ..................... [['0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950']] +[default0]: train_weighted_split_weights .................... [['0.3924620202', '0.0797519865', '0.0645613968', '0.0592991658', '0.0584218969', '0.0492644683', '0.0485168956', '0.048344327', '0.0462777165', '0.0326663657', '0.0202046859', '0.0140392334', '0.0097374884', '0.0087708344', '0.0070173416', '0.0059077793', '0.0059055884', '0.0055112422', '0.0041465344', '0.0037157869', '0.0034255885', '0.0028662571', '0.0027776135', '0.0026823974', '0.0026594781', '0.0026358847', '0.0026264474', '0.0024850557', '0.0024808426', '0.0024194999', '0.0020327371', '0.0018436532', '0.0017427072', '0.0017007448', '0.0016458059', '0.0013303289', '0.0012908943', '0.0012221364', '0.001211688', '0.0012015765', '0.0011909595', '0.0011650068', '0.001138717', '0.0011385485', '0.0011275944', '0.0011195053']] +[default0]: universal_checkpoint ............................ False +[default0]: use_bnb_optimizer ............................... False +[default0]: use_checkpoint_lr_scheduler ..................... False +[default0]: use_contiguous_buffers_in_ddp ................... False +[default0]: use_cpu_initialization .......................... None +[default0]: use_one_sent_docs ............................... False +[default0]: use_pin_memory .................................. False +[default0]: valid_num_workers ............................... 2 +[default0]: valid_weighted_split_names ...................... ['validation_pretraining', 'validation'] +[default0]: valid_weighted_split_paths ...................... [['/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/ar/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_ar_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/ca/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_ca_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/code/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_code_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/en/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_en_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/es/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_es_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/eu/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_eu_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/fr/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_fr_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/id/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_id_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-as/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-as_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-bn/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-bn_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-gu/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-gu_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-hi/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-hi_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-kn/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-kn_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-ml/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-ml_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-mr/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-mr_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-ne/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-ne_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-or/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-or_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-pa/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-pa_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-ta/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-ta_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-te/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-te_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-ur/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-ur_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/nigercongo-all/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_nigercongo-all_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/oscar-en/meg_ds_bigscience_tokenizer_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/oscar-zh/meg_ds_bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/pt/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_pt_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/vi/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_vi_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/zhs/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_zhs_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/zht/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_zht_text_document'], ['/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_en', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_es', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_pt', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_code', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_fr', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ar', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_id', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_zh', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_hi', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_vi', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ur', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_te', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ta', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_bn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_mr', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_sw', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_gu', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_pa', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ne', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_yo', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ig', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ny', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_zu', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_xh', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_sn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ts', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_rw', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_lg', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_tn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_nso', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_rn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ml', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_kn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_or', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_as', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ln', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_wo', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_tum', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ki', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_st', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_fon', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ca', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_eu', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ak', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_bm', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_tw']] +[default0]: valid_weighted_split_paths_path ................. None +[default0]: valid_weighted_split_splits ..................... [['0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0'], ['0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1']] +[default0]: valid_weighted_split_weights .................... [['0.0330676168743166', '0.011242051312222764', '0.13027200903379185', '0.22171164529099704', '0.10667815627928671', '0.0015595123898173287', '0.13054018439603915', '0.01091803753667153', '0.00011021422347108609', '0.005492381453597748', '0.0004021215011318779', '0.007470068593492175', '0.0006190467776576425', '0.0010335296343329384', '0.0005012010684646179', '0.0006672772956128299', '0.00035928138344705506', '0.0005084433130291778', '0.0021137328219915496', '0.0009129946225980253', '0.0012454301613725426', '0.00031588689199263235', '0.08137213783015229', '0.055293935695898196', '0.04954150576361177', '0.02461641286531197', '0.12091748245519074', '0.0005177025345001541'], ['0.3924620202', '0.0797519865', '0.0645613968', '0.0592991658', '0.0584218969', '0.0492644683', '0.0485168956', '0.048344327', '0.0462777165', '0.0326663657', '0.0202046859', '0.0140392334', '0.0097374884', '0.0087708344', '0.0070173416', '0.0059077793', '0.0059055884', '0.0055112422', '0.0041465344', '0.0037157869', '0.0034255885', '0.0028662571', '0.0027776135', '0.0026823974', '0.0026594781', '0.0026358847', '0.0026264474', '0.0024850557', '0.0024808426', '0.0024194999', '0.0020327371', '0.0018436532', '0.0017427072', '0.0017007448', '0.0016458059', '0.0013303289', '0.0012908943', '0.0012221364', '0.001211688', '0.0012015765', '0.0011909595', '0.0011650068', '0.001138717', '0.0011385485', '0.0011275944', '0.0011195053']] +[default0]: virtual_pipeline_model_parallel_size ............ None +[default0]: vocab_extra_ids ................................. 0 +[default0]: vocab_file ...................................... None +[default0]: weight_decay .................................... 0.0001 +[default0]: world_size ...................................... 96 +[default0]: zero_allgather_bucket_size ...................... 0.0 +[default0]: zero_contigious_gradients ....................... False +[default0]: zero_reduce_bucket_size ......................... 0.0 +[default0]: zero_reduce_scatter ............................. False +[default0]: zero_stage ...................................... 1 +[default0]:-------------------- end of arguments --------------------- +[default1]:Traceback (most recent call last): +[default1]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default2]:Traceback (most recent call last): +[default2]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default2]: main() +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default2]: return f(*args, **kwargs) +[default2]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default2]: pretrain( +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default2]: initialize_megatron(extra_args_provider=extra_args_provider, +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron +[default2]: set_global_variables(extra_args_provider=extra_args_provider, +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables +[default2]: _build_num_microbatches_calculator(args) +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator +[default2]: _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator +[default2]: num_microbatches_calculator = ConstantNumMicroBatches( +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ +[default2]: assert global_batch_size % micro_batch_times_data_parallel == 0, \ +[default2]:AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (24) +[default1]: main() +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default1]: return f(*args, **kwargs) +[default1]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default1]: pretrain( +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default3]:Traceback (most recent call last): +[default3]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default3]: main() +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default3]: return f(*args, **kwargs) +[default3]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default3]: pretrain( +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default1]: initialize_megatron(extra_args_provider=extra_args_provider, +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron +[default1]: set_global_variables(extra_args_provider=extra_args_provider, +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables +[default3]: initialize_megatron(extra_args_provider=extra_args_provider, +[default1]: _build_num_microbatches_calculator(args) +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron +[default0]:Traceback (most recent call last): +[default3]: set_global_variables(extra_args_provider=extra_args_provider, +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables +[default4]:Traceback (most recent call last): +[default0]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default0]: main() +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default4]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default4]: main() +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default4]: return f(*args, **kwargs) +[default4]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default4]: pretrain( +[default3]: _build_num_microbatches_calculator(args) +[default1]: _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator +[default3]: _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator +[default1]: num_microbatches_calculator = ConstantNumMicroBatches( +[default3]: num_microbatches_calculator = ConstantNumMicroBatches( +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ +[default3]: assert global_batch_size % micro_batch_times_data_parallel == 0, \ +[default0]: return f(*args, **kwargs) +[default3]:AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (24) +[default0]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default0]: pretrain( +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default0]: initialize_megatron(extra_args_provider=extra_args_provider, +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default4]: initialize_megatron(extra_args_provider=extra_args_provider, +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron +[default1]: assert global_batch_size % micro_batch_times_data_parallel == 0, \ +[default1]:AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (24) +[default0]: set_global_variables(extra_args_provider=extra_args_provider, +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables +[default0]: _build_num_microbatches_calculator(args) +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator +[default4]: set_global_variables(extra_args_provider=extra_args_provider, +[default0]: _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator +[default0]: num_microbatches_calculator = ConstantNumMicroBatches( +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ +[default0]: assert global_batch_size % micro_batch_times_data_parallel == 0, \ +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables +[default4]: _build_num_microbatches_calculator(args) +[default0]:AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (24) +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator +[default4]: _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator +[default4]: num_microbatches_calculator = ConstantNumMicroBatches( +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ +[default4]: assert global_batch_size % micro_batch_times_data_parallel == 0, \ +[default4]:AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (24) +[default7]:Traceback (most recent call last): +[default7]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default7]: main() +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default7]: return f(*args, **kwargs) +[default7]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default5]:Traceback (most recent call last): +[default7]: pretrain( +[default5]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default7]: initialize_megatron(extra_args_provider=extra_args_provider, +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron +[default6]:Traceback (most recent call last): +[default6]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default6]: main() +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default6]: return f(*args, **kwargs) +[default7]: set_global_variables(extra_args_provider=extra_args_provider, +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables +[default5]: main() +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default7]: _build_num_microbatches_calculator(args) +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator +[default7]: _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator +[default5]: return f(*args, **kwargs) +[default5]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default6]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default6]: pretrain( +[default5]: pretrain( +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default5]: initialize_megatron(extra_args_provider=extra_args_provider, +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default6]: initialize_megatron(extra_args_provider=extra_args_provider, +[default7]: num_microbatches_calculator = ConstantNumMicroBatches( +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ +[default7]: assert global_batch_size % micro_batch_times_data_parallel == 0, \ +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron +[default5]: set_global_variables(extra_args_provider=extra_args_provider, +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron +[default6]: set_global_variables(extra_args_provider=extra_args_provider, +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables +[default7]:AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (24) +[default6]: _build_num_microbatches_calculator(args) +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator +[default6]: _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables +[default5]: _build_num_microbatches_calculator(args) +[default6]: num_microbatches_calculator = ConstantNumMicroBatches( +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ +[default6]: assert global_batch_size % micro_batch_times_data_parallel == 0, \ +[default6]:AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (24) +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator +[default5]: _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator +[default5]: num_microbatches_calculator = ConstantNumMicroBatches( +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ +[default5]: assert global_batch_size % micro_batch_times_data_parallel == 0, \ +[default5]:AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (24) +[default0]:Traceback (most recent call last): +[default0]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default0]: main() +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default0]: return f(*args, **kwargs) +[default0]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default0]: pretrain( +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default0]: initialize_megatron(extra_args_provider=extra_args_provider, +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron +[default0]: set_global_variables(extra_args_provider=extra_args_provider, +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables +[default0]: _build_num_microbatches_calculator(args) +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator +[default0]: _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator +[default0]: num_microbatches_calculator = ConstantNumMicroBatches( +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ +[default0]: assert global_batch_size % micro_batch_times_data_parallel == 0, \ +[default0]:AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (24) +[default1]:Traceback (most recent call last): +[default1]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default1]: main() +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default1]: return f(*args, **kwargs) +[default1]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default1]: pretrain( +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default1]: initialize_megatron(extra_args_provider=extra_args_provider, +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron +[default1]: set_global_variables(extra_args_provider=extra_args_provider, +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables +[default1]: _build_num_microbatches_calculator(args) +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator +[default1]: _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator +[default1]: num_microbatches_calculator = ConstantNumMicroBatches( +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ +[default1]: assert global_batch_size % micro_batch_times_data_parallel == 0, \ +[default1]:AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (24) +[default4]:Traceback (most recent call last): +[default3]:Traceback (most recent call last): +[default3]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default3]: main() +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default3]: return f(*args, **kwargs) +[default2]:Traceback (most recent call last): +[default2]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default2]: main() +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default2]: return f(*args, **kwargs) +[default2]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default2]: pretrain( +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default3]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default2]: initialize_megatron(extra_args_provider=extra_args_provider, +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron +[default2]: set_global_variables(extra_args_provider=extra_args_provider, +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables +[default2]: _build_num_microbatches_calculator(args) +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator +[default2]: _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator +[default2]: num_microbatches_calculator = ConstantNumMicroBatches( +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ +[default2]: assert global_batch_size % micro_batch_times_data_parallel == 0, \ +[default2]:AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (24) +[default3]: pretrain( +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default3]: initialize_megatron(extra_args_provider=extra_args_provider, +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron +[default4]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default4]: main() +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default4]: return f(*args, **kwargs) +[default4]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default3]: set_global_variables(extra_args_provider=extra_args_provider, +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables +[default3]: _build_num_microbatches_calculator(args) +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator +[default4]: pretrain( +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default4]: initialize_megatron(extra_args_provider=extra_args_provider, +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron +[default3]: _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator +[default3]: num_microbatches_calculator = ConstantNumMicroBatches( +[default4]: set_global_variables(extra_args_provider=extra_args_provider, +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables +[default4]: _build_num_microbatches_calculator(args) +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ +[default3]: assert global_batch_size % micro_batch_times_data_parallel == 0, \ +[default3]:AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (24) +[default4]: _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator +[default4]: num_microbatches_calculator = ConstantNumMicroBatches( +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ +[default4]: assert global_batch_size % micro_batch_times_data_parallel == 0, \ +[default4]:AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (24) +[default5]:Traceback (most recent call last): +[default5]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default5]: main() +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default5]: return f(*args, **kwargs) +[default5]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default5]: pretrain( +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default5]: initialize_megatron(extra_args_provider=extra_args_provider, +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron +[default5]: set_global_variables(extra_args_provider=extra_args_provider, +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables +[default5]: _build_num_microbatches_calculator(args) +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator +[default5]: _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator +[default5]: num_microbatches_calculator = ConstantNumMicroBatches( +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ +[default5]: assert global_batch_size % micro_batch_times_data_parallel == 0, \ +[default5]:AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (24) +[default7]:Traceback (most recent call last): +[default7]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default7]: main() +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default7]: return f(*args, **kwargs) +[default7]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default7]: pretrain( +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default7]: initialize_megatron(extra_args_provider=extra_args_provider, +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron +[default7]: set_global_variables(extra_args_provider=extra_args_provider, +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables +[default7]: _build_num_microbatches_calculator(args) +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator +[default7]: _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator +[default7]: num_microbatches_calculator = ConstantNumMicroBatches( +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ +[default7]: assert global_batch_size % micro_batch_times_data_parallel == 0, \ +[default7]:AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (24) +[default6]:Traceback (most recent call last): +[default6]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default6]: main() +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default6]: return f(*args, **kwargs) +[default6]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default6]: pretrain( +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default6]: initialize_megatron(extra_args_provider=extra_args_provider, +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron +[default6]: set_global_variables(extra_args_provider=extra_args_provider, +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables +[default6]: _build_num_microbatches_calculator(args) +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator +[default6]: _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator +[default6]: num_microbatches_calculator = ConstantNumMicroBatches( +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ +[default6]: assert global_batch_size % micro_batch_times_data_parallel == 0, \ +[default6]:AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (24) +ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 0 (pid: 1753320) of binary: /gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/bin/python +ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 0 (pid: 1093676) of binary: /gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/bin/python +ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 0 (pid: 3438682) of binary: /gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/bin/python +ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 0 (pid: 74758) of binary: /gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/bin/python +ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 0 (pid: 2861348) of binary: /gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/bin/python +ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 0 (pid: 826079) of binary: /gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/bin/python +ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 0 (pid: 2796579) of binary: /gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/bin/python +ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 0 (pid: 880718) of binary: /gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/bin/python +ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 0 (pid: 1066986) of binary: /gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/bin/python +ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 0 (pid: 3726028) of binary: /gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/bin/python +ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 0 (pid: 3914941) of binary: /gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/bin/python +ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 0 (pid: 3525886) of binary: /gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/bin/python +Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 194, in _run_module_as_main +Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 194, in _run_module_as_main +Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 194, in _run_module_as_main +Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 194, in _run_module_as_main + return _run_code(code, main_globals, None, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 87, in _run_code + return _run_code(code, main_globals, None, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 87, in _run_code + exec(code, run_globals) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 765, in + exec(code, run_globals) + return _run_code(code, main_globals, None, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 87, in _run_code + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 765, in + return _run_code(code, main_globals, None, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 87, in _run_code +Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 194, in _run_module_as_main + exec(code, run_globals) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 765, in +Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 194, in _run_module_as_main + exec(code, run_globals) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 765, in + return _run_code(code, main_globals, None, + return _run_code(code, main_globals, None, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 87, in _run_code + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 87, in _run_code + exec(code, run_globals) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 765, in + exec(code, run_globals) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 765, in + main() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 194, in _run_module_as_main + main() + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 761, in main + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 761, in main + run(args) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 752, in run +Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 194, in _run_module_as_main + run(args) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 752, in run + elastic_launch( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 131, in __call__ + return launch_agent(self._config, self._entrypoint, list(args)) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 245, in launch_agent + raise ChildFailedError( + elastic_launch( +torch.distributed.elastic.multiprocessing.errors.ChildFailedError: +============================================================ +/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py FAILED +------------------------------------------------------------ +Failures: +[1]: + time : 2022-10-06_12:55:56 + host : jean-zay-iam10-ib0 + rank : 1 (local_rank: 1) + exitcode : 1 (pid: 2796580) + error_file: /tmp/torchelastic_uhpvfzwi/none_r0dm1zri/attempt_0/1/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + main() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 131, in __call__ + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron + set_global_variables(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables + _build_num_microbatches_calculator(args) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + main() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator + num_microbatches_calculator = ConstantNumMicroBatches( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ + assert global_batch_size % micro_batch_times_data_parallel == 0, \ + AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (24) + +[2]: + time : 2022-10-06_12:55:56 + host : jean-zay-iam10-ib0 + rank : 2 (local_rank: 2) + exitcode : 1 (pid: 2796581) + error_file: /tmp/torchelastic_uhpvfzwi/none_r0dm1zri/attempt_0/2/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron + set_global_variables(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables + _build_num_microbatches_calculator(args) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator + num_microbatches_calculator = ConstantNumMicroBatches( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ + assert global_batch_size % micro_batch_times_data_parallel == 0, \ + AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (24) + +[3]: + time : 2022-10-06_12:55:56 + host : jean-zay-iam10-ib0 + rank : 3 (local_rank: 3) + exitcode : 1 (pid: 2796582) + error_file: /tmp/torchelastic_uhpvfzwi/none_r0dm1zri/attempt_0/3/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + main() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron + set_global_variables(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables + _build_num_microbatches_calculator(args) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator + num_microbatches_calculator = ConstantNumMicroBatches( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ + assert global_batch_size % micro_batch_times_data_parallel == 0, \ + AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (24) + +[4]: + time : 2022-10-06_12:55:56 + host : jean-zay-iam10-ib0 + rank : 4 (local_rank: 4) + exitcode : 1 (pid: 2796583) + error_file: /tmp/torchelastic_uhpvfzwi/none_r0dm1zri/attempt_0/4/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron + set_global_variables(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables + _build_num_microbatches_calculator(args) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator + num_microbatches_calculator = ConstantNumMicroBatches( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ + assert global_batch_size % micro_batch_times_data_parallel == 0, \ + AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (24) + +[5]: + time : 2022-10-06_12:55:56 + host : jean-zay-iam10-ib0 + rank : 5 (local_rank: 5) + exitcode : 1 (pid: 2796584) + error_file: /tmp/torchelastic_uhpvfzwi/none_r0dm1zri/attempt_0/5/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron + set_global_variables(extra_args_provider=extra_args_provider, + return f(*args, **kwargs) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables + _build_num_microbatches_calculator(args) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator + num_microbatches_calculator = ConstantNumMicroBatches( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ + assert global_batch_size % micro_batch_times_data_parallel == 0, \ + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 761, in main + AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (24) + +[6]: + time : 2022-10-06_12:55:56 + host : jean-zay-iam10-ib0 + rank : 6 (local_rank: 6) + exitcode : 1 (pid: 2796585) + error_file: /tmp/torchelastic_uhpvfzwi/none_r0dm1zri/attempt_0/6/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 761, in main + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron + set_global_variables(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables + _build_num_microbatches_calculator(args) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator + num_microbatches_calculator = ConstantNumMicroBatches( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ + assert global_batch_size % micro_batch_times_data_parallel == 0, \ + AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (24) + +[7]: + time : 2022-10-06_12:55:56 + host : jean-zay-iam10-ib0 + rank : 7 (local_rank: 7) + exitcode : 1 (pid: 2796586) + error_file: /tmp/torchelastic_uhpvfzwi/none_r0dm1zri/attempt_0/7/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron + set_global_variables(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables + _build_num_microbatches_calculator(args) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator + num_microbatches_calculator = ConstantNumMicroBatches( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ + assert global_batch_size % micro_batch_times_data_parallel == 0, \ + AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (24) + +------------------------------------------------------------ +Root Cause (first observed failure): +[0]: + time : 2022-10-06_12:55:56 + host : jean-zay-iam10-ib0 + rank : 0 (local_rank: 0) + exitcode : 1 (pid: 2796579) + error_file: /tmp/torchelastic_uhpvfzwi/none_r0dm1zri/attempt_0/0/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron + set_global_variables(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables + _build_num_microbatches_calculator(args) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator + num_microbatches_calculator = ConstantNumMicroBatches( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ + assert global_batch_size % micro_batch_times_data_parallel == 0, \ + AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (24) + +============================================================ + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 761, in main + return _run_code(code, main_globals, None, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 87, in _run_code + main() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + exec(code, run_globals) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 765, in + run(args) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 752, in run + return _run_code(code, main_globals, None, + run(args) + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 761, in main + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 752, in run + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 87, in _run_code + run(args) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 752, in run + exec(code, run_globals) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 765, in + run(args) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 752, in run + elastic_launch( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 131, in __call__ + elastic_launch( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 131, in __call__ + elastic_launch( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 131, in __call__ + return launch_agent(self._config, self._entrypoint, list(args)) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 245, in launch_agent + return launch_agent(self._config, self._entrypoint, list(args)) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 245, in launch_agent + return launch_agent(self._config, self._entrypoint, list(args)) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 245, in launch_agent + raise ChildFailedError( +torch.distributed.elastic.multiprocessing.errors.ChildFailedError: +============================================================ +/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py FAILED +------------------------------------------------------------ +Failures: +[1]: + time : 2022-10-06_12:55:56 + host : jean-zay-iam11-ib0 + rank : 9 (local_rank: 1) + exitcode : 1 (pid: 3525887) + error_file: /tmp/torchelastic_qhvzlum9/none_7ld4__8_/attempt_0/1/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( +Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 194, in _run_module_as_main +Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 194, in _run_module_as_main + raise ChildFailedError( + elastic_launch( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 131, in __call__ + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron + set_global_variables(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables + _build_num_microbatches_calculator(args) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + raise ChildFailedError( +torch.distributed.elastic.multiprocessing.errors.ChildFailedError: +============================================================ +/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py FAILED +------------------------------------------------------------ +Failures: +[1]: + time : 2022-10-06_12:55:55 + host : jean-zay-iam29-ib0 + rank : 73 (local_rank: 1) + exitcode : 1 (pid: 1753321) + error_file: /tmp/torchelastic_j9_dif0q/none_1niwv9s5/attempt_0/1/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator + num_microbatches_calculator = ConstantNumMicroBatches( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ + assert global_batch_size % micro_batch_times_data_parallel == 0, \ + AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (24) + +[2]: + time : 2022-10-06_12:55:56 + host : jean-zay-iam11-ib0 + rank : 10 (local_rank: 2) + exitcode : 1 (pid: 3525888) + error_file: /tmp/torchelastic_qhvzlum9/none_7ld4__8_/attempt_0/2/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return launch_agent(self._config, self._entrypoint, list(args)) + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron + set_global_variables(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables + _build_num_microbatches_calculator(args) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 245, in launch_agent +torch.distributed.elastic.multiprocessing.errors.ChildFailedError: +============================================================ +/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py FAILED +------------------------------------------------------------ +Failures: +[1]: + time : 2022-10-06_12:55:56 + host : jean-zay-iam27-ib0 + rank : 57 (local_rank: 1) + exitcode : 1 (pid: 3438683) + error_file: /tmp/torchelastic_b4arw8_d/none_d3pnlxsz/attempt_0/1/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + return launch_agent(self._config, self._entrypoint, list(args)) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 245, in launch_agent + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron + set_global_variables(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables + _build_num_microbatches_calculator(args) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator + num_microbatches_calculator = ConstantNumMicroBatches( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ + assert global_batch_size % micro_batch_times_data_parallel == 0, \ + AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (24) + +[3]: + time : 2022-10-06_12:55:56 + host : jean-zay-iam11-ib0 + rank : 11 (local_rank: 3) + exitcode : 1 (pid: 3525889) + error_file: /tmp/torchelastic_qhvzlum9/none_7ld4__8_/attempt_0/3/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron + set_global_variables(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables + _build_num_microbatches_calculator(args) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron + set_global_variables(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables + _build_num_microbatches_calculator(args) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator + num_microbatches_calculator = ConstantNumMicroBatches( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ + assert global_batch_size % micro_batch_times_data_parallel == 0, \ + AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (24) + +[2]: + time : 2022-10-06_12:55:55 + host : jean-zay-iam29-ib0 + rank : 74 (local_rank: 2) + exitcode : 1 (pid: 1753322) + error_file: /tmp/torchelastic_j9_dif0q/none_1niwv9s5/attempt_0/2/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator + num_microbatches_calculator = ConstantNumMicroBatches( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ + assert global_batch_size % micro_batch_times_data_parallel == 0, \ + AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (24) + +[4]: + time : 2022-10-06_12:55:56 + host : jean-zay-iam11-ib0 + rank : 12 (local_rank: 4) + exitcode : 1 (pid: 3525890) + error_file: /tmp/torchelastic_qhvzlum9/none_7ld4__8_/attempt_0/4/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron + set_global_variables(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables + _build_num_microbatches_calculator(args) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator + num_microbatches_calculator = ConstantNumMicroBatches( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ + assert global_batch_size % micro_batch_times_data_parallel == 0, \ + AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (24) + +[2]: + time : 2022-10-06_12:55:56 + host : jean-zay-iam27-ib0 + rank : 58 (local_rank: 2) + exitcode : 1 (pid: 3438684) + error_file: /tmp/torchelastic_b4arw8_d/none_d3pnlxsz/attempt_0/2/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator + num_microbatches_calculator = ConstantNumMicroBatches( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ + assert global_batch_size % micro_batch_times_data_parallel == 0, \ + AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (24) + +[5]: + time : 2022-10-06_12:55:56 + host : jean-zay-iam11-ib0 + rank : 13 (local_rank: 5) + exitcode : 1 (pid: 3525891) + error_file: /tmp/torchelastic_qhvzlum9/none_7ld4__8_/attempt_0/5/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron + set_global_variables(extra_args_provider=extra_args_provider, + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron + set_global_variables(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables + _build_num_microbatches_calculator(args) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator + return _run_code(code, main_globals, None, + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron + set_global_variables(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables + _build_num_microbatches_calculator(args) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables + _build_num_microbatches_calculator(args) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator + num_microbatches_calculator = ConstantNumMicroBatches( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ + assert global_batch_size % micro_batch_times_data_parallel == 0, \ + return _run_code(code, main_globals, None, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 87, in _run_code + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + raise ChildFailedError( + AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (24) + +[6]: + time : 2022-10-06_12:55:56 + host : jean-zay-iam11-ib0 + rank : 14 (local_rank: 6) + exitcode : 1 (pid: 3525892) + error_file: /tmp/torchelastic_qhvzlum9/none_7ld4__8_/attempt_0/6/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator + num_microbatches_calculator = ConstantNumMicroBatches( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ + assert global_batch_size % micro_batch_times_data_parallel == 0, \ + AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (24) + +[3]: + time : 2022-10-06_12:55:55 + host : jean-zay-iam29-ib0 + rank : 75 (local_rank: 3) + exitcode : 1 (pid: 1753323) + error_file: /tmp/torchelastic_j9_dif0q/none_1niwv9s5/attempt_0/3/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + raise ChildFailedError( + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron + set_global_variables(extra_args_provider=extra_args_provider, + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron + set_global_variables(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables + _build_num_microbatches_calculator(args) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 87, in _run_code + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator + num_microbatches_calculator = ConstantNumMicroBatches( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ + assert global_batch_size % micro_batch_times_data_parallel == 0, \ + AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (24) + +[3]: + time : 2022-10-06_12:55:56 + host : jean-zay-iam27-ib0 + rank : 59 (local_rank: 3) + exitcode : 1 (pid: 3438685) + error_file: /tmp/torchelastic_b4arw8_d/none_d3pnlxsz/attempt_0/3/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables + _build_num_microbatches_calculator(args) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator + num_microbatches_calculator = ConstantNumMicroBatches( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ + assert global_batch_size % micro_batch_times_data_parallel == 0, \ + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator + num_microbatches_calculator = ConstantNumMicroBatches( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ + assert global_batch_size % micro_batch_times_data_parallel == 0, \ + AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (24) + +[4]: + time : 2022-10-06_12:55:55 + host : jean-zay-iam29-ib0 + rank : 76 (local_rank: 4) + exitcode : 1 (pid: 1753324) + error_file: /tmp/torchelastic_j9_dif0q/none_1niwv9s5/attempt_0/4/error.json + traceback : Traceback (most recent call last): + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron + set_global_variables(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables + _build_num_microbatches_calculator(args) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator + AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (24) + +[7]: + time : 2022-10-06_12:55:56 + host : jean-zay-iam11-ib0 + rank : 15 (local_rank: 7) + exitcode : 1 (pid: 3525893) + error_file: /tmp/torchelastic_qhvzlum9/none_7ld4__8_/attempt_0/7/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + exec(code, run_globals) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 765, in + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron + set_global_variables(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables + _build_num_microbatches_calculator(args) + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator + num_microbatches_calculator = ConstantNumMicroBatches( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ + assert global_batch_size % micro_batch_times_data_parallel == 0, \ + AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (24) + +[4]: + time : 2022-10-06_12:55:56 + host : jean-zay-iam27-ib0 + rank : 60 (local_rank: 4) + exitcode : 1 (pid: 3438686) + error_file: /tmp/torchelastic_b4arw8_d/none_d3pnlxsz/attempt_0/4/error.json + traceback : Traceback (most recent call last): + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron + set_global_variables(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables + _build_num_microbatches_calculator(args) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator + num_microbatches_calculator = ConstantNumMicroBatches( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator + num_microbatches_calculator = ConstantNumMicroBatches( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ + assert global_batch_size % micro_batch_times_data_parallel == 0, \ + AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (24) + +[5]: + time : 2022-10-06_12:55:55 + host : jean-zay-iam29-ib0 + rank : 77 (local_rank: 5) + exitcode : 1 (pid: 1753325) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron + set_global_variables(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables + _build_num_microbatches_calculator(args) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ + assert global_batch_size % micro_batch_times_data_parallel == 0, \ + AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (24) + +------------------------------------------------------------ +Root Cause (first observed failure): +[0]: + time : 2022-10-06_12:55:56 + host : jean-zay-iam11-ib0 + rank : 8 (local_rank: 0) + exitcode : 1 (pid: 3525886) + error_file: /tmp/torchelastic_qhvzlum9/none_7ld4__8_/attempt_0/0/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + error_file: /tmp/torchelastic_j9_dif0q/none_1niwv9s5/attempt_0/5/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron + set_global_variables(extra_args_provider=extra_args_provider, + exec(code, run_globals) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 765, in + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator + num_microbatches_calculator = ConstantNumMicroBatches( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ + assert global_batch_size % micro_batch_times_data_parallel == 0, \ + AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (24) + +[5]: + time : 2022-10-06_12:55:56 + host : jean-zay-iam27-ib0 + rank : 61 (local_rank: 5) + exitcode : 1 (pid: 3438687) +torch.distributed.elastic.multiprocessing.errors.ChildFailedError: +============================================================ +/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py FAILED +------------------------------------------------------------ +Failures: +[1]: + time : 2022-10-06_12:55:56 + host : jean-zay-iam31-ib0 + rank : 81 (local_rank: 1) + exitcode : 1 (pid: 826080) + error_file: /tmp/torchelastic_gnn0lc0b/none_bar1qc6m/attempt_0/1/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables + _build_num_microbatches_calculator(args) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator + num_microbatches_calculator = ConstantNumMicroBatches( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ + assert global_batch_size % micro_batch_times_data_parallel == 0, \ +torch.distributed.elastic.multiprocessing.errors.ChildFailedError: +============================================================ +/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py FAILED +------------------------------------------------------------ +Failures: +[1]: + time : 2022-10-06_12:55:55 + host : jean-zay-iam22-ib0 + rank : 25 (local_rank: 1) + exitcode : 1 (pid: 3726029) + error_file: /tmp/torchelastic_gjqy0twn/none__3cbm882/attempt_0/1/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + error_file: /tmp/torchelastic_b4arw8_d/none_d3pnlxsz/attempt_0/5/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron + set_global_variables(extra_args_provider=extra_args_provider, + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron + set_global_variables(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables + _build_num_microbatches_calculator(args) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator + num_microbatches_calculator = ConstantNumMicroBatches( + AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (24) + +[6]: + time : 2022-10-06_12:55:55 + host : jean-zay-iam29-ib0 + rank : 78 (local_rank: 6) + exitcode : 1 (pid: 1753326) + error_file: /tmp/torchelastic_j9_dif0q/none_1niwv9s5/attempt_0/6/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables + _build_num_microbatches_calculator(args) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator + num_microbatches_calculator = ConstantNumMicroBatches( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ + assert global_batch_size % micro_batch_times_data_parallel == 0, \ + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ + assert global_batch_size % micro_batch_times_data_parallel == 0, \ + AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (24) + +============================================================ + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron + set_global_variables(extra_args_provider=extra_args_provider, + AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (24) + +[6]: + time : 2022-10-06_12:55:56 + host : jean-zay-iam27-ib0 + rank : 62 (local_rank: 6) + exitcode : 1 (pid: 3438688) + error_file: /tmp/torchelastic_b4arw8_d/none_d3pnlxsz/attempt_0/6/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron + set_global_variables(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables + _build_num_microbatches_calculator(args) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables + _build_num_microbatches_calculator(args) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator + num_microbatches_calculator = ConstantNumMicroBatches( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ + assert global_batch_size % micro_batch_times_data_parallel == 0, \ + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron + set_global_variables(extra_args_provider=extra_args_provider, + AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (24) + +[7]: + time : 2022-10-06_12:55:55 + host : jean-zay-iam29-ib0 + rank : 79 (local_rank: 7) + exitcode : 1 (pid: 1753327) + error_file: /tmp/torchelastic_j9_dif0q/none_1niwv9s5/attempt_0/7/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables + _build_num_microbatches_calculator(args) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator + num_microbatches_calculator = ConstantNumMicroBatches( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ + assert global_batch_size % micro_batch_times_data_parallel == 0, \ + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron + set_global_variables(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables + _build_num_microbatches_calculator(args) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator + num_microbatches_calculator = ConstantNumMicroBatches( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron + set_global_variables(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables + _build_num_microbatches_calculator(args) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (24) + +[7]: + time : 2022-10-06_12:55:56 + host : jean-zay-iam27-ib0 + rank : 63 (local_rank: 7) + exitcode : 1 (pid: 3438689) + error_file: /tmp/torchelastic_b4arw8_d/none_d3pnlxsz/attempt_0/7/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ + assert global_batch_size % micro_batch_times_data_parallel == 0, \ + AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (24) + +------------------------------------------------------------ +Root Cause (first observed failure): +[0]: + time : 2022-10-06_12:55:55 + host : jean-zay-iam29-ib0 + rank : 72 (local_rank: 0) + exitcode : 1 (pid: 1753320) + error_file: /tmp/torchelastic_j9_dif0q/none_1niwv9s5/attempt_0/0/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron + set_global_variables(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables + _build_num_microbatches_calculator(args) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator + num_microbatches_calculator = ConstantNumMicroBatches( + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ + assert global_batch_size % micro_batch_times_data_parallel == 0, \ + AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (24) + +------------------------------------------------------------ +Root Cause (first observed failure): +[0]: + time : 2022-10-06_12:55:56 + host : jean-zay-iam27-ib0 + rank : 56 (local_rank: 0) + exitcode : 1 (pid: 3438682) + error_file: /tmp/torchelastic_b4arw8_d/none_d3pnlxsz/attempt_0/0/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator + num_microbatches_calculator = ConstantNumMicroBatches( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ + assert global_batch_size % micro_batch_times_data_parallel == 0, \ + AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (24) + +[2]: + time : 2022-10-06_12:55:56 + host : jean-zay-iam31-ib0 + rank : 82 (local_rank: 2) + exitcode : 1 (pid: 826081) + error_file: /tmp/torchelastic_gnn0lc0b/none_bar1qc6m/attempt_0/2/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron + set_global_variables(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables + _build_num_microbatches_calculator(args) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator + num_microbatches_calculator = ConstantNumMicroBatches( + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ + assert global_batch_size % micro_batch_times_data_parallel == 0, \ + AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (24) + +============================================================ + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron + set_global_variables(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables + _build_num_microbatches_calculator(args) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator + num_microbatches_calculator = ConstantNumMicroBatches( + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron + set_global_variables(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables + _build_num_microbatches_calculator(args) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator + num_microbatches_calculator = ConstantNumMicroBatches( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ + assert global_batch_size % micro_batch_times_data_parallel == 0, \ + AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (24) + +[2]: + time : 2022-10-06_12:55:55 + host : jean-zay-iam22-ib0 + rank : 26 (local_rank: 2) + exitcode : 1 (pid: 3726030) + error_file: /tmp/torchelastic_gjqy0twn/none__3cbm882/attempt_0/2/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ + assert global_batch_size % micro_batch_times_data_parallel == 0, \ + AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (24) + +============================================================ + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron + set_global_variables(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables + _build_num_microbatches_calculator(args) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator + num_microbatches_calculator = ConstantNumMicroBatches( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ + assert global_batch_size % micro_batch_times_data_parallel == 0, \ + AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (24) + +[3]: + time : 2022-10-06_12:55:56 + host : jean-zay-iam31-ib0 + rank : 83 (local_rank: 3) + exitcode : 1 (pid: 826082) + error_file: /tmp/torchelastic_gnn0lc0b/none_bar1qc6m/attempt_0/3/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron + set_global_variables(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables + _build_num_microbatches_calculator(args) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator + num_microbatches_calculator = ConstantNumMicroBatches( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ + assert global_batch_size % micro_batch_times_data_parallel == 0, \ + AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (24) + +[3]: + time : 2022-10-06_12:55:55 + host : jean-zay-iam22-ib0 + rank : 27 (local_rank: 3) + exitcode : 1 (pid: 3726031) + error_file: /tmp/torchelastic_gjqy0twn/none__3cbm882/attempt_0/3/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator + num_microbatches_calculator = ConstantNumMicroBatches( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ + assert global_batch_size % micro_batch_times_data_parallel == 0, \ + AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (24) + +[4]: + time : 2022-10-06_12:55:56 + host : jean-zay-iam31-ib0 + rank : 84 (local_rank: 4) + exitcode : 1 (pid: 826083) + error_file: /tmp/torchelastic_gnn0lc0b/none_bar1qc6m/attempt_0/4/error.json + traceback : Traceback (most recent call last): + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron + set_global_variables(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables + _build_num_microbatches_calculator(args) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron + set_global_variables(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables + _build_num_microbatches_calculator(args) + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator + num_microbatches_calculator = ConstantNumMicroBatches( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ + assert global_batch_size % micro_batch_times_data_parallel == 0, \ + AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (24) + +[4]: + time : 2022-10-06_12:55:55 + host : jean-zay-iam22-ib0 + rank : 28 (local_rank: 4) + exitcode : 1 (pid: 3726032) + error_file: /tmp/torchelastic_gjqy0twn/none__3cbm882/attempt_0/4/error.json + traceback : Traceback (most recent call last): + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator + num_microbatches_calculator = ConstantNumMicroBatches( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ + assert global_batch_size % micro_batch_times_data_parallel == 0, \ + AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (24) + +[5]: + time : 2022-10-06_12:55:56 + host : jean-zay-iam31-ib0 + rank : 85 (local_rank: 5) + exitcode : 1 (pid: 826084) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron + set_global_variables(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables + _build_num_microbatches_calculator(args) + error_file: /tmp/torchelastic_gnn0lc0b/none_bar1qc6m/attempt_0/5/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron + set_global_variables(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator + num_microbatches_calculator = ConstantNumMicroBatches( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ + assert global_batch_size % micro_batch_times_data_parallel == 0, \ + AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (24) + +[5]: + time : 2022-10-06_12:55:55 + host : jean-zay-iam22-ib0 + rank : 29 (local_rank: 5) + exitcode : 1 (pid: 3726033) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables + _build_num_microbatches_calculator(args) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator + num_microbatches_calculator = ConstantNumMicroBatches( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ + assert global_batch_size % micro_batch_times_data_parallel == 0, \ + error_file: /tmp/torchelastic_gjqy0twn/none__3cbm882/attempt_0/5/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron + set_global_variables(extra_args_provider=extra_args_provider, + AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (24) + +[6]: + time : 2022-10-06_12:55:56 + host : jean-zay-iam31-ib0 + rank : 86 (local_rank: 6) + exitcode : 1 (pid: 826085) + error_file: /tmp/torchelastic_gnn0lc0b/none_bar1qc6m/attempt_0/6/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables + _build_num_microbatches_calculator(args) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator + num_microbatches_calculator = ConstantNumMicroBatches( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ + assert global_batch_size % micro_batch_times_data_parallel == 0, \ + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron + set_global_variables(extra_args_provider=extra_args_provider, + AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (24) + +[6]: + time : 2022-10-06_12:55:55 + host : jean-zay-iam22-ib0 + rank : 30 (local_rank: 6) + exitcode : 1 (pid: 3726034) + error_file: /tmp/torchelastic_gjqy0twn/none__3cbm882/attempt_0/6/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables + _build_num_microbatches_calculator(args) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator + num_microbatches_calculator = ConstantNumMicroBatches( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ + assert global_batch_size % micro_batch_times_data_parallel == 0, \ + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron + set_global_variables(extra_args_provider=extra_args_provider, + AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (24) + +[7]: + time : 2022-10-06_12:55:56 + host : jean-zay-iam31-ib0 + rank : 87 (local_rank: 7) + exitcode : 1 (pid: 826086) + error_file: /tmp/torchelastic_gnn0lc0b/none_bar1qc6m/attempt_0/7/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables + _build_num_microbatches_calculator(args) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator + num_microbatches_calculator = ConstantNumMicroBatches( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ + assert global_batch_size % micro_batch_times_data_parallel == 0, \ + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron + set_global_variables(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables + _build_num_microbatches_calculator(args) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator + num_microbatches_calculator = ConstantNumMicroBatches( + AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (24) + +[7]: + time : 2022-10-06_12:55:55 + host : jean-zay-iam22-ib0 + rank : 31 (local_rank: 7) + exitcode : 1 (pid: 3726035) + error_file: /tmp/torchelastic_gjqy0twn/none__3cbm882/attempt_0/7/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ + assert global_batch_size % micro_batch_times_data_parallel == 0, \ + AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (24) + +------------------------------------------------------------ +Root Cause (first observed failure): +[0]: + time : 2022-10-06_12:55:56 + host : jean-zay-iam31-ib0 + rank : 80 (local_rank: 0) + exitcode : 1 (pid: 826079) + error_file: /tmp/torchelastic_gnn0lc0b/none_bar1qc6m/attempt_0/0/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron + set_global_variables(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables + _build_num_microbatches_calculator(args) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator + num_microbatches_calculator = ConstantNumMicroBatches( + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ + assert global_batch_size % micro_batch_times_data_parallel == 0, \ + AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (24) + +------------------------------------------------------------ +Root Cause (first observed failure): +[0]: + time : 2022-10-06_12:55:55 + host : jean-zay-iam22-ib0 + rank : 24 (local_rank: 0) + exitcode : 1 (pid: 3726028) + error_file: /tmp/torchelastic_gjqy0twn/none__3cbm882/attempt_0/0/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron + set_global_variables(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables + _build_num_microbatches_calculator(args) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator + num_microbatches_calculator = ConstantNumMicroBatches( + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ + assert global_batch_size % micro_batch_times_data_parallel == 0, \ + AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (24) + +============================================================ + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron + set_global_variables(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables + _build_num_microbatches_calculator(args) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator + num_microbatches_calculator = ConstantNumMicroBatches( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ + assert global_batch_size % micro_batch_times_data_parallel == 0, \ + AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (24) + +============================================================ + main() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + main() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + main() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + main() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 761, in main + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 761, in main + return f(*args, **kwargs) + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 761, in main + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 761, in main + run(args) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 752, in run + run(args) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 752, in run + run(args) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 752, in run + run(args) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 752, in run +Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 194, in _run_module_as_main +Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 194, in _run_module_as_main + elastic_launch( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 131, in __call__ + elastic_launch( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 131, in __call__ + return _run_code(code, main_globals, None, + return _run_code(code, main_globals, None, + return launch_agent(self._config, self._entrypoint, list(args)) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 245, in launch_agent + return launch_agent(self._config, self._entrypoint, list(args)) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 245, in launch_agent + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 87, in _run_code + elastic_launch( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 131, in __call__ + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 87, in _run_code + elastic_launch( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 131, in __call__ + exec(code, run_globals) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 765, in + exec(code, run_globals) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 765, in + raise ChildFailedError( + return launch_agent(self._config, self._entrypoint, list(args)) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 245, in launch_agent +torch.distributed.elastic.multiprocessing.errors.ChildFailedError: +============================================================ +/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py FAILED +------------------------------------------------------------ +Failures: +[1]: + time : 2022-10-06_12:55:55 + host : jean-zay-iam24-ib0 + rank : 41 (local_rank: 1) + exitcode : 1 (pid: 880719) + error_file: /tmp/torchelastic_lrzo9f4i/none_mqa3ggjc/attempt_0/1/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + raise ChildFailedError( + raise ChildFailedError( +torch.distributed.elastic.multiprocessing.errors.ChildFailedError: +============================================================ +/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py FAILED +------------------------------------------------------------ +Failures: +[1]: + time : 2022-10-06_12:55:55 + host : jean-zay-iam12-ib0 + rank : 17 (local_rank: 1) + exitcode : 1 (pid: 1093677) + error_file: /tmp/torchelastic_h91i45t7/none_ow5awqc9/attempt_0/1/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron + set_global_variables(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables + _build_num_microbatches_calculator(args) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron + set_global_variables(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables + _build_num_microbatches_calculator(args) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator + num_microbatches_calculator = ConstantNumMicroBatches( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ + assert global_batch_size % micro_batch_times_data_parallel == 0, \ + AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (24) + +[2]: + time : 2022-10-06_12:55:55 + host : jean-zay-iam24-ib0 + rank : 42 (local_rank: 2) + exitcode : 1 (pid: 880720) + error_file: /tmp/torchelastic_lrzo9f4i/none_mqa3ggjc/attempt_0/2/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +torch.distributed.elastic.multiprocessing.errors.ChildFailedError: +============================================================ +/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py FAILED +------------------------------------------------------------ +Failures: +[1]: + time : 2022-10-06_12:55:55 + host : jean-zay-iam25-ib0 + rank : 49 (local_rank: 1) + exitcode : 1 (pid: 74759) + error_file: /tmp/torchelastic_iin04mz8/none_wcc390ui/attempt_0/1/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron + set_global_variables(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables + _build_num_microbatches_calculator(args) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator + num_microbatches_calculator = ConstantNumMicroBatches( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ + assert global_batch_size % micro_batch_times_data_parallel == 0, \ + AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (24) + +[2]: + time : 2022-10-06_12:55:55 + host : jean-zay-iam12-ib0 + rank : 18 (local_rank: 2) + exitcode : 1 (pid: 1093678) + error_file: /tmp/torchelastic_h91i45t7/none_ow5awqc9/attempt_0/2/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron + set_global_variables(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables + _build_num_microbatches_calculator(args) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron + set_global_variables(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables + _build_num_microbatches_calculator(args) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator + num_microbatches_calculator = ConstantNumMicroBatches( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ + assert global_batch_size % micro_batch_times_data_parallel == 0, \ + AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (24) + +[3]: + time : 2022-10-06_12:55:55 + host : jean-zay-iam24-ib0 + rank : 43 (local_rank: 3) + exitcode : 1 (pid: 880721) + error_file: /tmp/torchelastic_lrzo9f4i/none_mqa3ggjc/attempt_0/3/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron + set_global_variables(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables + _build_num_microbatches_calculator(args) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator + num_microbatches_calculator = ConstantNumMicroBatches( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ + assert global_batch_size % micro_batch_times_data_parallel == 0, \ + AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (24) + +[2]: + time : 2022-10-06_12:55:55 + host : jean-zay-iam25-ib0 + rank : 50 (local_rank: 2) + exitcode : 1 (pid: 74760) + error_file: /tmp/torchelastic_iin04mz8/none_wcc390ui/attempt_0/2/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator + num_microbatches_calculator = ConstantNumMicroBatches( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ + assert global_batch_size % micro_batch_times_data_parallel == 0, \ + AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (24) + +[3]: + time : 2022-10-06_12:55:55 + host : jean-zay-iam12-ib0 + rank : 19 (local_rank: 3) + exitcode : 1 (pid: 1093679) + error_file: /tmp/torchelastic_h91i45t7/none_ow5awqc9/attempt_0/3/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator + num_microbatches_calculator = ConstantNumMicroBatches( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ + assert global_batch_size % micro_batch_times_data_parallel == 0, \ + AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (24) + +[4]: + time : 2022-10-06_12:55:55 + host : jean-zay-iam24-ib0 + rank : 44 (local_rank: 4) + exitcode : 1 (pid: 880722) + error_file: /tmp/torchelastic_lrzo9f4i/none_mqa3ggjc/attempt_0/4/error.json + traceback : Traceback (most recent call last): + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron + set_global_variables(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables + _build_num_microbatches_calculator(args) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron + set_global_variables(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables + _build_num_microbatches_calculator(args) + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator + num_microbatches_calculator = ConstantNumMicroBatches( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ + assert global_batch_size % micro_batch_times_data_parallel == 0, \ + AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (24) + +[4]: + time : 2022-10-06_12:55:55 + host : jean-zay-iam12-ib0 + rank : 20 (local_rank: 4) + exitcode : 1 (pid: 1093680) + error_file: /tmp/torchelastic_h91i45t7/none_ow5awqc9/attempt_0/4/error.json + traceback : Traceback (most recent call last): + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator + num_microbatches_calculator = ConstantNumMicroBatches( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ + assert global_batch_size % micro_batch_times_data_parallel == 0, \ + AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (24) + +[5]: + time : 2022-10-06_12:55:55 + host : jean-zay-iam24-ib0 + rank : 45 (local_rank: 5) + exitcode : 1 (pid: 880723) + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron + set_global_variables(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables + _build_num_microbatches_calculator(args) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron + set_global_variables(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables + _build_num_microbatches_calculator(args) + error_file: /tmp/torchelastic_lrzo9f4i/none_mqa3ggjc/attempt_0/5/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron + set_global_variables(extra_args_provider=extra_args_provider, + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator + num_microbatches_calculator = ConstantNumMicroBatches( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ + assert global_batch_size % micro_batch_times_data_parallel == 0, \ + AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (24) + +[5]: + time : 2022-10-06_12:55:55 + host : jean-zay-iam12-ib0 + rank : 21 (local_rank: 5) + exitcode : 1 (pid: 1093681) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables + _build_num_microbatches_calculator(args) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator + num_microbatches_calculator = ConstantNumMicroBatches( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ + assert global_batch_size % micro_batch_times_data_parallel == 0, \ + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator + num_microbatches_calculator = ConstantNumMicroBatches( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ + assert global_batch_size % micro_batch_times_data_parallel == 0, \ + AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (24) + +[3]: + time : 2022-10-06_12:55:55 + host : jean-zay-iam25-ib0 + rank : 51 (local_rank: 3) + exitcode : 1 (pid: 74761) + error_file: /tmp/torchelastic_iin04mz8/none_wcc390ui/attempt_0/3/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + error_file: /tmp/torchelastic_h91i45t7/none_ow5awqc9/attempt_0/5/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron + set_global_variables(extra_args_provider=extra_args_provider, + AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (24) + +[6]: + time : 2022-10-06_12:55:55 + host : jean-zay-iam24-ib0 + rank : 46 (local_rank: 6) + exitcode : 1 (pid: 880724) + error_file: /tmp/torchelastic_lrzo9f4i/none_mqa3ggjc/attempt_0/6/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron + set_global_variables(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables + _build_num_microbatches_calculator(args) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables + _build_num_microbatches_calculator(args) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator + num_microbatches_calculator = ConstantNumMicroBatches( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ + assert global_batch_size % micro_batch_times_data_parallel == 0, \ + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron + set_global_variables(extra_args_provider=extra_args_provider, + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator + num_microbatches_calculator = ConstantNumMicroBatches( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ + assert global_batch_size % micro_batch_times_data_parallel == 0, \ + AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (24) + +[4]: + time : 2022-10-06_12:55:55 + host : jean-zay-iam25-ib0 + rank : 52 (local_rank: 4) + exitcode : 1 (pid: 74762) + error_file: /tmp/torchelastic_iin04mz8/none_wcc390ui/attempt_0/4/error.json + traceback : Traceback (most recent call last): + AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (24) + +[6]: + time : 2022-10-06_12:55:55 + host : jean-zay-iam12-ib0 + rank : 22 (local_rank: 6) + exitcode : 1 (pid: 1093682) + error_file: /tmp/torchelastic_h91i45t7/none_ow5awqc9/attempt_0/6/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables + _build_num_microbatches_calculator(args) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator + num_microbatches_calculator = ConstantNumMicroBatches( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ + assert global_batch_size % micro_batch_times_data_parallel == 0, \ + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron + set_global_variables(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables + _build_num_microbatches_calculator(args) + main() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron + set_global_variables(extra_args_provider=extra_args_provider, + AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (24) + +[7]: + time : 2022-10-06_12:55:55 + host : jean-zay-iam24-ib0 + rank : 47 (local_rank: 7) + exitcode : 1 (pid: 880725) + error_file: /tmp/torchelastic_lrzo9f4i/none_mqa3ggjc/attempt_0/7/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator + num_microbatches_calculator = ConstantNumMicroBatches( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ + assert global_batch_size % micro_batch_times_data_parallel == 0, \ + AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (24) + +[5]: + time : 2022-10-06_12:55:55 + host : jean-zay-iam25-ib0 + rank : 53 (local_rank: 5) + exitcode : 1 (pid: 74763) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables + _build_num_microbatches_calculator(args) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator + num_microbatches_calculator = ConstantNumMicroBatches( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ + assert global_batch_size % micro_batch_times_data_parallel == 0, \ + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron + set_global_variables(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables + _build_num_microbatches_calculator(args) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator + num_microbatches_calculator = ConstantNumMicroBatches( + error_file: /tmp/torchelastic_iin04mz8/none_wcc390ui/attempt_0/5/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron + set_global_variables(extra_args_provider=extra_args_provider, + AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (24) + +[7]: + time : 2022-10-06_12:55:55 + host : jean-zay-iam12-ib0 + rank : 23 (local_rank: 7) + exitcode : 1 (pid: 1093683) + error_file: /tmp/torchelastic_h91i45t7/none_ow5awqc9/attempt_0/7/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ + assert global_batch_size % micro_batch_times_data_parallel == 0, \ + AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (24) + +------------------------------------------------------------ +Root Cause (first observed failure): +[0]: + time : 2022-10-06_12:55:55 + host : jean-zay-iam24-ib0 + rank : 40 (local_rank: 0) + exitcode : 1 (pid: 880718) + error_file: /tmp/torchelastic_lrzo9f4i/none_mqa3ggjc/attempt_0/0/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables + _build_num_microbatches_calculator(args) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator + num_microbatches_calculator = ConstantNumMicroBatches( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ + assert global_batch_size % micro_batch_times_data_parallel == 0, \ + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron + set_global_variables(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables + _build_num_microbatches_calculator(args) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator + num_microbatches_calculator = ConstantNumMicroBatches( + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (24) + +[6]: + time : 2022-10-06_12:55:55 + host : jean-zay-iam25-ib0 + rank : 54 (local_rank: 6) + exitcode : 1 (pid: 74764) + error_file: /tmp/torchelastic_iin04mz8/none_wcc390ui/attempt_0/6/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ + assert global_batch_size % micro_batch_times_data_parallel == 0, \ + AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (24) + +------------------------------------------------------------ +Root Cause (first observed failure): +[0]: + time : 2022-10-06_12:55:55 + host : jean-zay-iam12-ib0 + rank : 16 (local_rank: 0) + exitcode : 1 (pid: 1093676) + error_file: /tmp/torchelastic_h91i45t7/none_ow5awqc9/attempt_0/0/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron + set_global_variables(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables + _build_num_microbatches_calculator(args) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator + num_microbatches_calculator = ConstantNumMicroBatches( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron + set_global_variables(extra_args_provider=extra_args_provider, + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ + assert global_batch_size % micro_batch_times_data_parallel == 0, \ + AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (24) + +============================================================ + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables + _build_num_microbatches_calculator(args) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator + num_microbatches_calculator = ConstantNumMicroBatches( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ + assert global_batch_size % micro_batch_times_data_parallel == 0, \ + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron + set_global_variables(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables + _build_num_microbatches_calculator(args) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator + num_microbatches_calculator = ConstantNumMicroBatches( + AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (24) + +[7]: + time : 2022-10-06_12:55:55 + host : jean-zay-iam25-ib0 + rank : 55 (local_rank: 7) + exitcode : 1 (pid: 74765) + error_file: /tmp/torchelastic_iin04mz8/none_wcc390ui/attempt_0/7/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ + assert global_batch_size % micro_batch_times_data_parallel == 0, \ + AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (24) + +============================================================ + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron + set_global_variables(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables + _build_num_microbatches_calculator(args) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator + num_microbatches_calculator = ConstantNumMicroBatches( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ + assert global_batch_size % micro_batch_times_data_parallel == 0, \ + AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (24) + +------------------------------------------------------------ +Root Cause (first observed failure): +[0]: + time : 2022-10-06_12:55:55 + host : jean-zay-iam25-ib0 + rank : 48 (local_rank: 0) + exitcode : 1 (pid: 74758) + error_file: /tmp/torchelastic_iin04mz8/none_wcc390ui/attempt_0/0/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron + set_global_variables(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables + _build_num_microbatches_calculator(args) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator + num_microbatches_calculator = ConstantNumMicroBatches( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ + assert global_batch_size % micro_batch_times_data_parallel == 0, \ + AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (24) + +============================================================ + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 761, in main + run(args) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 752, in run + elastic_launch( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 131, in __call__ + return launch_agent(self._config, self._entrypoint, list(args)) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 245, in launch_agent + return launch_agent(self._config, self._entrypoint, list(args)) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 245, in launch_agent + raise ChildFailedError( + raise ChildFailedError( +torch.distributed.elastic.multiprocessing.errors.ChildFailedError: +============================================================ +/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py FAILED +------------------------------------------------------------ +Failures: +[1]: + time : 2022-10-06_12:55:56 + host : jean-zay-iam28-ib0 + rank : 65 (local_rank: 1) + exitcode : 1 (pid: 3914942) + error_file: /tmp/torchelastic_8mowprow/none__qpp4zdo/attempt_0/1/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron + set_global_variables(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables + _build_num_microbatches_calculator(args) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( +torch.distributed.elastic.multiprocessing.errors.ChildFailedError: +============================================================ +/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py FAILED +------------------------------------------------------------ +Failures: +[1]: + time : 2022-10-06_12:55:56 + host : jean-zay-iam23-ib0 + rank : 33 (local_rank: 1) + exitcode : 1 (pid: 2861349) + error_file: /tmp/torchelastic_5wfmct97/none_dxdfqgc8/attempt_0/1/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator + num_microbatches_calculator = ConstantNumMicroBatches( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ + assert global_batch_size % micro_batch_times_data_parallel == 0, \ + AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (24) + +[2]: + time : 2022-10-06_12:55:56 + host : jean-zay-iam28-ib0 + rank : 66 (local_rank: 2) + exitcode : 1 (pid: 3914943) + error_file: /tmp/torchelastic_8mowprow/none__qpp4zdo/attempt_0/2/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron + set_global_variables(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables + _build_num_microbatches_calculator(args) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron + set_global_variables(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables + _build_num_microbatches_calculator(args) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator + num_microbatches_calculator = ConstantNumMicroBatches( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ + assert global_batch_size % micro_batch_times_data_parallel == 0, \ + AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (24) + +[3]: + time : 2022-10-06_12:55:56 + host : jean-zay-iam28-ib0 + rank : 67 (local_rank: 3) + exitcode : 1 (pid: 3914944) + error_file: /tmp/torchelastic_8mowprow/none__qpp4zdo/attempt_0/3/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron + set_global_variables(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables + _build_num_microbatches_calculator(args) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator + num_microbatches_calculator = ConstantNumMicroBatches( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ + assert global_batch_size % micro_batch_times_data_parallel == 0, \ + AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (24) + +[4]: + time : 2022-10-06_12:55:56 + host : jean-zay-iam28-ib0 + rank : 68 (local_rank: 4) + exitcode : 1 (pid: 3914945) + error_file: /tmp/torchelastic_8mowprow/none__qpp4zdo/attempt_0/4/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron + set_global_variables(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables + _build_num_microbatches_calculator(args) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator + num_microbatches_calculator = ConstantNumMicroBatches( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ + assert global_batch_size % micro_batch_times_data_parallel == 0, \ + AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (24) + +[5]: + time : 2022-10-06_12:55:56 + host : jean-zay-iam28-ib0 + rank : 69 (local_rank: 5) + exitcode : 1 (pid: 3914946) + error_file: /tmp/torchelastic_8mowprow/none__qpp4zdo/attempt_0/5/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron + set_global_variables(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator + num_microbatches_calculator = ConstantNumMicroBatches( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ + assert global_batch_size % micro_batch_times_data_parallel == 0, \ + AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (24) + +[2]: + time : 2022-10-06_12:55:56 + host : jean-zay-iam23-ib0 + rank : 34 (local_rank: 2) + exitcode : 1 (pid: 2861350) + error_file: /tmp/torchelastic_5wfmct97/none_dxdfqgc8/attempt_0/2/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables + _build_num_microbatches_calculator(args) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator + num_microbatches_calculator = ConstantNumMicroBatches( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ + assert global_batch_size % micro_batch_times_data_parallel == 0, \ + main() + AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (24) + +[6]: + time : 2022-10-06_12:55:56 + host : jean-zay-iam28-ib0 + rank : 70 (local_rank: 6) + exitcode : 1 (pid: 3914947) + error_file: /tmp/torchelastic_8mowprow/none__qpp4zdo/attempt_0/6/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron + set_global_variables(extra_args_provider=extra_args_provider, + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron + set_global_variables(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables + _build_num_microbatches_calculator(args) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables + _build_num_microbatches_calculator(args) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator + num_microbatches_calculator = ConstantNumMicroBatches( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ + assert global_batch_size % micro_batch_times_data_parallel == 0, \ + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (24) + +[7]: + time : 2022-10-06_12:55:56 + host : jean-zay-iam28-ib0 + rank : 71 (local_rank: 7) + exitcode : 1 (pid: 3914948) + error_file: /tmp/torchelastic_8mowprow/none__qpp4zdo/attempt_0/7/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator + num_microbatches_calculator = ConstantNumMicroBatches( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ + assert global_batch_size % micro_batch_times_data_parallel == 0, \ + AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (24) + +[3]: + time : 2022-10-06_12:55:56 + host : jean-zay-iam23-ib0 + rank : 35 (local_rank: 3) + exitcode : 1 (pid: 2861351) + error_file: /tmp/torchelastic_5wfmct97/none_dxdfqgc8/attempt_0/3/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron + set_global_variables(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables + _build_num_microbatches_calculator(args) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator + num_microbatches_calculator = ConstantNumMicroBatches( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ + assert global_batch_size % micro_batch_times_data_parallel == 0, \ + AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (24) + +------------------------------------------------------------ +Root Cause (first observed failure): +[0]: + time : 2022-10-06_12:55:56 + host : jean-zay-iam28-ib0 + rank : 64 (local_rank: 0) + exitcode : 1 (pid: 3914941) + error_file: /tmp/torchelastic_8mowprow/none__qpp4zdo/attempt_0/0/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron + set_global_variables(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables + _build_num_microbatches_calculator(args) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator + num_microbatches_calculator = ConstantNumMicroBatches( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ + assert global_batch_size % micro_batch_times_data_parallel == 0, \ + AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (24) + +[4]: + time : 2022-10-06_12:55:56 + host : jean-zay-iam23-ib0 + rank : 36 (local_rank: 4) + exitcode : 1 (pid: 2861352) + error_file: /tmp/torchelastic_5wfmct97/none_dxdfqgc8/attempt_0/4/error.json + traceback : Traceback (most recent call last): + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron + set_global_variables(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables + _build_num_microbatches_calculator(args) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator + num_microbatches_calculator = ConstantNumMicroBatches( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron + set_global_variables(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables + _build_num_microbatches_calculator(args) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ + assert global_batch_size % micro_batch_times_data_parallel == 0, \ + AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (24) + +============================================================ + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator + num_microbatches_calculator = ConstantNumMicroBatches( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ + assert global_batch_size % micro_batch_times_data_parallel == 0, \ + AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (24) + +[5]: + time : 2022-10-06_12:55:56 + host : jean-zay-iam23-ib0 + rank : 37 (local_rank: 5) + exitcode : 1 (pid: 2861353) + error_file: /tmp/torchelastic_5wfmct97/none_dxdfqgc8/attempt_0/5/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron + set_global_variables(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables + _build_num_microbatches_calculator(args) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator + num_microbatches_calculator = ConstantNumMicroBatches( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ + assert global_batch_size % micro_batch_times_data_parallel == 0, \ + AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (24) + +[6]: + time : 2022-10-06_12:55:56 + host : jean-zay-iam23-ib0 + rank : 38 (local_rank: 6) + exitcode : 1 (pid: 2861354) + error_file: /tmp/torchelastic_5wfmct97/none_dxdfqgc8/attempt_0/6/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron + set_global_variables(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables + _build_num_microbatches_calculator(args) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator + num_microbatches_calculator = ConstantNumMicroBatches( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ + assert global_batch_size % micro_batch_times_data_parallel == 0, \ + AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (24) + +[7]: + time : 2022-10-06_12:55:56 + host : jean-zay-iam23-ib0 + rank : 39 (local_rank: 7) + exitcode : 1 (pid: 2861355) + error_file: /tmp/torchelastic_5wfmct97/none_dxdfqgc8/attempt_0/7/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron + set_global_variables(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables + _build_num_microbatches_calculator(args) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator + num_microbatches_calculator = ConstantNumMicroBatches( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ + assert global_batch_size % micro_batch_times_data_parallel == 0, \ + AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (24) + +------------------------------------------------------------ +Root Cause (first observed failure): +[0]: + time : 2022-10-06_12:55:56 + host : jean-zay-iam23-ib0 + rank : 32 (local_rank: 0) + exitcode : 1 (pid: 2861348) + error_file: /tmp/torchelastic_5wfmct97/none_dxdfqgc8/attempt_0/0/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron + set_global_variables(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables + _build_num_microbatches_calculator(args) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator + num_microbatches_calculator = ConstantNumMicroBatches( + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 761, in main + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ + assert global_batch_size % micro_batch_times_data_parallel == 0, \ + AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (24) + +============================================================ + run(args) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 752, in run + elastic_launch( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 131, in __call__ + return launch_agent(self._config, self._entrypoint, list(args)) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 245, in launch_agent + raise ChildFailedError( +torch.distributed.elastic.multiprocessing.errors.ChildFailedError: +============================================================ +/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py FAILED +------------------------------------------------------------ +Failures: +[1]: + time : 2022-10-06_12:55:56 + host : jean-zay-iam33-ib0 + rank : 89 (local_rank: 1) + exitcode : 1 (pid: 1066987) + error_file: /tmp/torchelastic_vd78f69_/none_gn1gxau9/attempt_0/1/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron + set_global_variables(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables + _build_num_microbatches_calculator(args) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator + num_microbatches_calculator = ConstantNumMicroBatches( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ + assert global_batch_size % micro_batch_times_data_parallel == 0, \ + AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (24) + +[2]: + time : 2022-10-06_12:55:56 + host : jean-zay-iam33-ib0 + rank : 90 (local_rank: 2) + exitcode : 1 (pid: 1066988) + error_file: /tmp/torchelastic_vd78f69_/none_gn1gxau9/attempt_0/2/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron + set_global_variables(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables + _build_num_microbatches_calculator(args) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator + num_microbatches_calculator = ConstantNumMicroBatches( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ + assert global_batch_size % micro_batch_times_data_parallel == 0, \ + AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (24) + +[3]: + time : 2022-10-06_12:55:56 + host : jean-zay-iam33-ib0 + rank : 91 (local_rank: 3) + exitcode : 1 (pid: 1066989) + error_file: /tmp/torchelastic_vd78f69_/none_gn1gxau9/attempt_0/3/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron + set_global_variables(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables + _build_num_microbatches_calculator(args) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator + num_microbatches_calculator = ConstantNumMicroBatches( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ + assert global_batch_size % micro_batch_times_data_parallel == 0, \ + AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (24) + +[4]: + time : 2022-10-06_12:55:56 + host : jean-zay-iam33-ib0 + rank : 92 (local_rank: 4) + exitcode : 1 (pid: 1066990) + error_file: /tmp/torchelastic_vd78f69_/none_gn1gxau9/attempt_0/4/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron + set_global_variables(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables + _build_num_microbatches_calculator(args) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator + num_microbatches_calculator = ConstantNumMicroBatches( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ + assert global_batch_size % micro_batch_times_data_parallel == 0, \ + AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (24) + +[5]: + time : 2022-10-06_12:55:56 + host : jean-zay-iam33-ib0 + rank : 93 (local_rank: 5) + exitcode : 1 (pid: 1066991) + error_file: /tmp/torchelastic_vd78f69_/none_gn1gxau9/attempt_0/5/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron + set_global_variables(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables + _build_num_microbatches_calculator(args) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator + num_microbatches_calculator = ConstantNumMicroBatches( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ + assert global_batch_size % micro_batch_times_data_parallel == 0, \ + AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (24) + +[6]: + time : 2022-10-06_12:55:56 + host : jean-zay-iam33-ib0 + rank : 94 (local_rank: 6) + exitcode : 1 (pid: 1066992) + error_file: /tmp/torchelastic_vd78f69_/none_gn1gxau9/attempt_0/6/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron + set_global_variables(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables + _build_num_microbatches_calculator(args) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator + num_microbatches_calculator = ConstantNumMicroBatches( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ + assert global_batch_size % micro_batch_times_data_parallel == 0, \ + AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (24) + +[7]: + time : 2022-10-06_12:55:56 + host : jean-zay-iam33-ib0 + rank : 95 (local_rank: 7) + exitcode : 1 (pid: 1066993) + error_file: /tmp/torchelastic_vd78f69_/none_gn1gxau9/attempt_0/7/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron + set_global_variables(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables + _build_num_microbatches_calculator(args) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator + num_microbatches_calculator = ConstantNumMicroBatches( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ + assert global_batch_size % micro_batch_times_data_parallel == 0, \ + AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (24) + +------------------------------------------------------------ +Root Cause (first observed failure): +[0]: + time : 2022-10-06_12:55:56 + host : jean-zay-iam33-ib0 + rank : 88 (local_rank: 0) + exitcode : 1 (pid: 1066986) + error_file: /tmp/torchelastic_vd78f69_/none_gn1gxau9/attempt_0/0/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 87, in initialize_megatron + set_global_variables(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 93, in set_global_variables + _build_num_microbatches_calculator(args) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/global_vars.py", line 119, in _build_num_microbatches_calculator + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 26, in build_num_microbatches_calculator + num_microbatches_calculator = ConstantNumMicroBatches( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/microbatches.py", line 77, in __init__ + assert global_batch_size % micro_batch_times_data_parallel == 0, \ + AssertionError: global batch size (2048) is not divisible by micro batch size (1) times data parallel size (24) + +============================================================ +srun: error: jean-zay-iam24: task 5: Exited with exit code 1 +srun: launch/slurm: _step_signal: Terminating StepId=2075406.0 +srun: error: jean-zay-iam25: task 6: Exited with exit code 1 +srun: error: jean-zay-iam29: task 9: Exited with exit code 1 +srun: error: jean-zay-iam33: task 11: Exited with exit code 1 +srun: error: jean-zay-iam12: task 2: Exited with exit code 1 +srun: error: jean-zay-iam27: task 7: Exited with exit code 1 +srun: error: jean-zay-iam11: task 1: Exited with exit code 1 +srun: error: jean-zay-iam28: task 8: Exited with exit code 1 +srun: error: jean-zay-iam22: task 3: Exited with exit code 1 +srun: error: jean-zay-iam10: task 0: Exited with exit code 1 +srun: error: jean-zay-iam31: task 10: Exited with exit code 1 +srun: error: jean-zay-iam23: task 4: Exited with exit code 1 +WARNING:__main__: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +WARNING:__main__: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +WARNING:__main__: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +WARNING:__main__: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +WARNING:__main__: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +WARNING:__main__: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +WARNING:__main__: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +WARNING:__main__: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +[default7]:> setting tensorboard ... +[default0]:using world size: 64, data-parallel-size: 16, tensor-model-parallel size: 2, pipeline-model-parallel size: 2 +[default0]:using torch.float16 for parameters ... +[default0]:------------------------ arguments ------------------------ +[default0]: abort_on_unmet_fused_kernel_constraints ......... True +[default0]: accumulate_allreduce_grads_in_fp32 .............. False +[default0]: adam_beta1 ...................................... 0.9 +[default0]: adam_beta2 ...................................... 0.95 +[default0]: adam_eps ........................................ 1e-08 +[default0]: adlr_autoresume ................................. False +[default0]: adlr_autoresume_interval ........................ 1000 +[default0]: apply_query_key_layer_scaling ................... True +[default0]: apply_residual_connection_post_layernorm ........ False +[default0]: attention_dropout ............................... 0.1 +[default0]: attention_softmax_in_fp32 ....................... False +[default0]: bert_binary_head ................................ True +[default0]: bert_load ....................................... None +[default0]: bf16 ............................................ False +[default0]: bias_dropout_fusion ............................. True +[default0]: bias_gelu_fusion ................................ True +[default0]: biencoder_projection_dim ........................ 0 +[default0]: biencoder_shared_query_context_model ............ False +[default0]: block_data_path ................................. None +[default0]: checkpoint_activations .......................... True +[default0]: checkpoint_in_cpu ............................... False +[default0]: checkpoint_num_layers ........................... 1 +[default0]: clip_grad ....................................... 1.0 +[default0]: codecarbon_dir .................................. None +[default0]: consumed_train_samples .......................... 0 +[default0]: consumed_train_tokens ........................... 0 +[default0]: consumed_valid_samples .......................... 0 +[default0]: contigious_checkpointing ........................ False +[default0]: cpu_optimizer ................................... False +[default0]: cpu_torch_adam .................................. False +[default0]: curriculum_learning ............................. False +[default0]: data_impl ....................................... mmap +[default0]: data_parallel_size .............................. 16 +[default0]: data_path ....................................... None +[default0]: dataloader_type ................................. single +[default0]: DDP_impl ........................................ local +[default0]: decoder_seq_length .............................. None +[default0]: deepscale ....................................... False +[default0]: deepscale_config ................................ None +[default0]: deepspeed ....................................... True +[default0]: deepspeed_activation_checkpointing .............. True +[default0]: deepspeed_config ................................ ./ds_config.2075534.json +[default0]: deepspeed_mpi ................................... False +[default0]: distribute_checkpointed_activations ............. False +[default0]: distributed_backend ............................. nccl +[default0]: embed_layernorm ................................. True +[default0]: embedding_path .................................. None +[default0]: encoder_seq_length .............................. 2048 +[default0]: eod_mask_loss ................................... False +[default0]: eval_interval ................................... 125 +[default0]: eval_iters ...................................... 10 +[default0]: eval_only ....................................... None +[default0]: evidence_data_path .............................. None +[default0]: exit_duration_in_mins ........................... 5990 +[default0]: exit_interval ................................... None +[default0]: ffn_hidden_size ................................. 6144 +[default0]: finetune ........................................ False +[default0]: fp16 ............................................ True +[default0]: fp16_lm_cross_entropy ........................... False +[default0]: fp32_residual_connection ........................ False +[default0]: gigaflos_no_embeds .............................. 0 +[default0]: global_batch_size ............................... 2048 +[default0]: glu_activation .................................. None +[default0]: hidden_dropout .................................. 0.1 +[default0]: hidden_size ..................................... 1536 +[default0]: hysteresis ...................................... 2 +[default0]: ict_head_size ................................... None +[default0]: ict_load ........................................ None +[default0]: img_dim ......................................... 224 +[default0]: indexer_batch_size .............................. 128 +[default0]: indexer_log_interval ............................ 1000 +[default0]: inference ....................................... False +[default0]: init_method_std ................................. 0.0048 +[default0]: init_method_xavier_uniform ...................... False +[default0]: initial_loss_scale .............................. 4294967296 +[default0]: kill_switch_path ................................ /gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/kill-switch-tr13b-1B3-mtf +[default0]: kv_channels ..................................... 96 +[default0]: layernorm_epsilon ............................... 1e-05 +[default0]: lazy_mpu_init ................................... None +[default0]: load ............................................ /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq +[default0]: local_rank ...................................... None +[default0]: log_batch_size_to_tensorboard ................... True +[default0]: log_interval .................................... 1 +[default0]: log_learning_rate_to_tensorboard ................ True +[default0]: log_level ....................................... None +[default0]: log_level_replica ............................... None +[default0]: log_loss_scale_to_tensorboard ................... True +[default0]: log_num_zeros_in_grad ........................... False +[default0]: log_params_norm ................................. False +[default0]: log_path ........................................ None +[default0]: log_timers_to_tensorboard ....................... True +[default0]: log_validation_ppl_to_tensorboard ............... True +[default0]: loss_on_targets_only ............................ False +[default0]: loss_scale ...................................... None +[default0]: loss_scale_window ............................... 1000 +[default0]: lr .............................................. 2e-05 +[default0]: lr_decay_iters .................................. None +[default0]: lr_decay_samples ................................ None +[default0]: lr_decay_style .................................. constant +[default0]: lr_decay_tokens ................................. None +[default0]:Offline mode: forcing local_files_only=True +[default0]:Offline mode: forcing local_files_only=True +[default0]:loading file tokenizer.json from cache at /gpfswork/rech/six/commun/models/models--bigscience--tokenizer/snapshots/d56c744c331ce0ffa516ed084785eb22da74b91e/tokenizer.json +[default0]:loading file added_tokens.json from cache at None +[default0]:loading file special_tokens_map.json from cache at /gpfswork/rech/six/commun/models/models--bigscience--tokenizer/snapshots/d56c744c331ce0ffa516ed084785eb22da74b91e/special_tokens_map.json +[default0]:loading file tokenizer_config.json from cache at /gpfswork/rech/six/commun/models/models--bigscience--tokenizer/snapshots/d56c744c331ce0ffa516ed084785eb22da74b91e/tokenizer_config.json +[default0]: lr_warmup_fraction .............................. None +[default0]: lr_warmup_iters ................................. 0 +[default0]: lr_warmup_samples ............................... 0 +[default0]: make_vocab_size_divisible_by .................... 128 +[default0]: mask_prob ....................................... 0.15 +[default0]: masked_softmax_fusion ........................... True +[default0]: max_position_embeddings ......................... 2048 +[default0]: mean_noise_span_length .......................... None +[default0]: memory_centric_tiled_linear ..................... False +[default0]: merge_file ...................................... None +[default0]: micro_batch_size ................................ 1 +[default0]: min_loss_scale .................................. 1.0 +[default0]: min_lr .......................................... 0.0 +[default0]: mmap_warmup ..................................... False +[default0]: no_load_optim ................................... True +[default0]: no_load_rng ..................................... None +[default0]: no_save_optim ................................... None +[default0]: no_save_rng ..................................... None +[default0]: noise_density ................................... None +[default0]: norm_target_loss ................................ True +[default0]: num_attention_heads ............................. 16 +[default0]: num_channels .................................... 3 +[default0]: num_classes ..................................... 1000 +[default0]: num_layers ...................................... 24 +[default0]: num_layers_per_virtual_pipeline_stage ........... None +[default0]: num_workers ..................................... 2 +[default0]: onnx_safe ....................................... None +[default0]: openai_gelu ..................................... False +[default0]: optimizer ....................................... adam +[default0]: override_lr_scheduler ........................... False +[default0]: pad_vocab_size_to ............................... 250880 +[default0]: params_dtype .................................... torch.float16 +[default0]: partition_activations ........................... False +[default0]: patch_dim ....................................... 16 +[default0]: pipeline_model_parallel_size .................... 2 +[default0]: position_embedding_type ......................... PositionEmbeddingType.alibi +[default0]: pp_partition_method ............................. type:transformer|embedding +[default0]: prefixlm ........................................ False +[default0]: profile_backward ................................ False +[default0]: query_in_block_prob ............................. 0.1 +[default0]: rampup_batch_size ............................... None +[default0]: rank ............................................ 0 +[default0]: remote_device ................................... none +[default0]: reset_attention_mask ............................ False +[default0]: reset_position_ids .............................. False +[default0]: reset_progress .................................. True +[default0]: retriever_report_topk_accuracies ................ [] +[default0]: retriever_score_scaling ......................... False +[default0]: retriever_seq_length ............................ 256 +[default0]: reweight_loss_based_on_position_frequency ....... False +[default0]: sample_rate ..................................... 1.0 +[default0]: save ............................................ /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq +[default0]: save_interval ................................... 2 +[default0]: scatter_gather_tensors_in_pipeline .............. True +[default0]: scattered_embeddings ............................ False +[default0]: seed ............................................ 42 +[default0]: seq_length ...................................... 2048 +[default0]: sgd_momentum .................................... 0.9 +[default0]: short_seq_prob .................................. 0.1 +[default0]: skip_train_iteration_range ...................... None +[default0]: split ........................................... None +[default0]: split_transformers .............................. False +[default0]: sync_tp_duplicated_parameters ................... False +[default0]: synchronize_each_layer .......................... False +[default0]: tensor_model_parallel_size ...................... 2 +[default0]: tensorboard_dir ................................. /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/tr13b-1B3-ml-logs/tensorboard/xp3capmixnewcodelonglossseq +[default0]: tensorboard_log_interval ........................ 1 +[default0]: tensorboard_queue_size .......................... 5 +[default0]: test_weighted_split_paths ....................... None +[default0]: test_weighted_split_paths_path .................. None +[default0]: tile_factor ..................................... 1 +[default0]: titles_data_path ................................ None +[default0]: tokenizer_name_or_path .......................... bigscience/tokenizer +[default0]: tokenizer_type .................................. PretrainedFromHF +[default0]: train_iters ..................................... None +[default0]: train_samples ................................... 6348800 +[default0]: train_tokens .................................... None +[default0]: train_weighted_split_names ...................... ['train'] +[default0]: train_weighted_split_paths ...................... [['/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_en', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_es', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_pt', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_code', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_fr', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ar', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_id', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_zh', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_hi', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_vi', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ur', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_te', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ta', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_bn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_mr', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_sw', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_gu', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_pa', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ne', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_yo', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ig', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ny', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_zu', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_xh', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_sn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ts', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_rw', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_lg', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_tn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_nso', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_rn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ml', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_kn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_or', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_as', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ln', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_wo', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_tum', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ki', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_st', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_fon', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ca', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_eu', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ak', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_bm', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_tw']] +[default0]: train_weighted_split_paths_path ................. None +[default0]: train_weighted_split_splits ..................... [['0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950']] +[default0]: train_weighted_split_weights .................... [['0.3924620202', '0.0797519865', '0.0645613968', '0.0592991658', '0.0584218969', '0.0492644683', '0.0485168956', '0.048344327', '0.0462777165', '0.0326663657', '0.0202046859', '0.0140392334', '0.0097374884', '0.0087708344', '0.0070173416', '0.0059077793', '0.0059055884', '0.0055112422', '0.0041465344', '0.0037157869', '0.0034255885', '0.0028662571', '0.0027776135', '0.0026823974', '0.0026594781', '0.0026358847', '0.0026264474', '0.0024850557', '0.0024808426', '0.0024194999', '0.0020327371', '0.0018436532', '0.0017427072', '0.0017007448', '0.0016458059', '0.0013303289', '0.0012908943', '0.0012221364', '0.001211688', '0.0012015765', '0.0011909595', '0.0011650068', '0.001138717', '0.0011385485', '0.0011275944', '0.0011195053']] +[default0]: universal_checkpoint ............................ False +[default0]: use_bnb_optimizer ............................... False +[default0]: use_checkpoint_lr_scheduler ..................... False +[default0]: use_contiguous_buffers_in_ddp ................... False +[default0]: use_cpu_initialization .......................... None +[default0]: use_one_sent_docs ............................... False +[default0]: use_pin_memory .................................. False +[default0]: valid_num_workers ............................... 2 +[default0]: valid_weighted_split_names ...................... ['validation_pretraining', 'validation'] +[default0]: valid_weighted_split_paths ...................... [['/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/ar/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_ar_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/ca/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_ca_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/code/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_code_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/en/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_en_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/es/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_es_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/eu/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_eu_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/fr/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_fr_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/id/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_id_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-as/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-as_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-bn/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-bn_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-gu/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-gu_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-hi/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-hi_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-kn/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-kn_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-ml/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-ml_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-mr/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-mr_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-ne/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-ne_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-or/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-or_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-pa/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-pa_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-ta/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-ta_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-te/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-te_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-ur/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-ur_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/nigercongo-all/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_nigercongo-all_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/oscar-en/meg_ds_bigscience_tokenizer_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/oscar-zh/meg_ds_bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/pt/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_pt_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/vi/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_vi_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/zhs/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_zhs_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/zht/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_zht_text_document'], ['/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_en', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_es', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_pt', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_code', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_fr', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ar', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_id', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_zh', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_hi', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_vi', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ur', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_te', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ta', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_bn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_mr', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_sw', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_gu', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_pa', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ne', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_yo', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ig', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ny', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_zu', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_xh', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_sn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ts', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_rw', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_lg', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_tn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_nso', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_rn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ml', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_kn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_or', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_as', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ln', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_wo', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_tum', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ki', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_st', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_fon', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ca', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_eu', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ak', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_bm', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_tw']] +[default0]: valid_weighted_split_paths_path ................. None +[default0]: valid_weighted_split_splits ..................... [['0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0'], ['0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1']] +[default0]: valid_weighted_split_weights .................... [['0.0330676168743166', '0.011242051312222764', '0.13027200903379185', '0.22171164529099704', '0.10667815627928671', '0.0015595123898173287', '0.13054018439603915', '0.01091803753667153', '0.00011021422347108609', '0.005492381453597748', '0.0004021215011318779', '0.007470068593492175', '0.0006190467776576425', '0.0010335296343329384', '0.0005012010684646179', '0.0006672772956128299', '0.00035928138344705506', '0.0005084433130291778', '0.0021137328219915496', '0.0009129946225980253', '0.0012454301613725426', '0.00031588689199263235', '0.08137213783015229', '0.055293935695898196', '0.04954150576361177', '0.02461641286531197', '0.12091748245519074', '0.0005177025345001541'], ['0.3924620202', '0.0797519865', '0.0645613968', '0.0592991658', '0.0584218969', '0.0492644683', '0.0485168956', '0.048344327', '0.0462777165', '0.0326663657', '0.0202046859', '0.0140392334', '0.0097374884', '0.0087708344', '0.0070173416', '0.0059077793', '0.0059055884', '0.0055112422', '0.0041465344', '0.0037157869', '0.0034255885', '0.0028662571', '0.0027776135', '0.0026823974', '0.0026594781', '0.0026358847', '0.0026264474', '0.0024850557', '0.0024808426', '0.0024194999', '0.0020327371', '0.0018436532', '0.0017427072', '0.0017007448', '0.0016458059', '0.0013303289', '0.0012908943', '0.0012221364', '0.001211688', '0.0012015765', '0.0011909595', '0.0011650068', '0.001138717', '0.0011385485', '0.0011275944', '0.0011195053']] +[default0]: virtual_pipeline_model_parallel_size ............ None +[default0]: vocab_extra_ids ................................. 0 +[default0]: vocab_file ...................................... None +[default0]: weight_decay .................................... 0.0001 +[default0]: world_size ...................................... 64 +[default0]: zero_allgather_bucket_size ...................... 0.0 +[default0]: zero_contigious_gradients ....................... False +[default0]: zero_reduce_bucket_size ......................... 0.0 +[default0]: zero_reduce_scatter ............................. False +[default0]: zero_stage ...................................... 1 +[default0]:-------------------- end of arguments --------------------- +[default0]:setting number of micro-batches to constant 128 +[default0]:> building PretrainedFromHF tokenizer ... +[default0]: vocab file is un-used. loading tokenizer from pre-trained model +[default0]: > padded vocab (size: 250680) with 200 dummy tokens (new size: 250880) +[default0]:DeepSpeed general environment info: +[default0]:torch install path ............... ['/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch'] +[default0]:torch version .................... 1.12.0 +[default0]:torch cuda version ............... 11.3 +[default0]:torch hip version ................ None +[default0]:nvcc version ..................... 11.4 +[default0]:deepspeed install path ........... ['/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed'] +[default0]:deepspeed info ................... 0.7.1+8b2a6371, 8b2a6371, master +[default0]:deepspeed wheel compiled w. ...... torch 1.12, cuda 11.3 +[default0]:**** Git info for Megatron: git_hash=6c1018f git_branch=mtf-multival **** +[default0]:> initializing torch distributed ... +[default0]:[2022-10-06 13:04:51,005] [INFO] [comm.py:628:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl +[default0]:> initializing tensor model parallel with size 2 +[default0]:> initializing pipeline model parallel with size 2 +[default0]:> setting random seeds to 42 ... +[default0]:[2022-10-06 13:04:55,267] [INFO] [checkpointing.py:226:model_parallel_cuda_manual_seed] > initializing model parallel cuda seeds on global rank 0, model parallel rank 0, and data parallel rank 0 with model parallel seed: 2760 and data parallel seed: 42 +[default0]:> compiling dataset index builder ... +[default0]:make: Entering directory '/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/data' +[default0]:make: Nothing to be done for 'default'. +[default0]:make: Leaving directory '/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/data' +[default0]:>>> done with dataset index builder. Compilation time: 0.076 seconds +[default0]:> compiling and loading fused kernels ... +[default0]:Detected CUDA files, patching ldflags +[default0]:Emitting ninja build file /gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/fused_kernels/build/build.ninja... +[default0]:Building extension module scaled_upper_triang_masked_softmax_cuda... +[default0]:Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) +[default0]:ninja: no work to do. +[default0]:Loading extension module scaled_upper_triang_masked_softmax_cuda... +[default0]:Detected CUDA files, patching ldflags +[default0]:Emitting ninja build file /gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/fused_kernels/build/build.ninja... +[default0]:Building extension module scaled_masked_softmax_cuda... +[default0]:Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) +[default0]:ninja: no work to do. +[default0]:Loading extension module scaled_masked_softmax_cuda... +[default0]:Detected CUDA files, patching ldflags +[default0]:Emitting ninja build file /gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/fused_kernels/build/build.ninja... +[default0]:Building extension module fused_mix_prec_layer_norm_cuda... +[default0]:Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) +[default0]:ninja: no work to do. +[default0]:Loading extension module fused_mix_prec_layer_norm_cuda... +[default0]:>>> done with compiling and loading fused kernels. Compilation time: 6.022 seconds +[default0]:time to initialize megatron (seconds): -34.633 +[default0]:[after megatron is initialized] datetime: 2022-10-06 13:05:01 +[default0]:building GPT model ... +[default0]:[2022-10-06 13:05:01,414] [INFO] [utils.py:827:see_memory_usage] Before Building Model +[default0]:[2022-10-06 13:05:01,414] [INFO] [utils.py:828:see_memory_usage] MA 0.0 GB Max_MA 0.0 GB CA 0.0 GB Max_CA 0 GB +[default0]:[2022-10-06 13:05:01,415] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 33.04 GB, percent = 6.6% +[default0]:SEED_LAYERS=False BASE_SEED=1234 SEED_FN=None +[default0]:Using topology: {ProcessCoord(pipe=0, data=0, model=0): 0, ProcessCoord(pipe=0, data=0, model=1): 1, ProcessCoord(pipe=0, data=1, model=0): 2, ProcessCoord(pipe=0, data=1, model=1): 3, ProcessCoord(pipe=0, data=2, model=0): 4, ProcessCoord(pipe=0, data=2, model=1): 5, ProcessCoord(pipe=0, data=3, model=0): 6, ProcessCoord(pipe=0, data=3, model=1): 7, ProcessCoord(pipe=0, data=4, model=0): 8, ProcessCoord(pipe=0, data=4, model=1): 9, ProcessCoord(pipe=0, data=5, model=0): 10, ProcessCoord(pipe=0, data=5, model=1): 11, ProcessCoord(pipe=0, data=6, model=0): 12, ProcessCoord(pipe=0, data=6, model=1): 13, ProcessCoord(pipe=0, data=7, model=0): 14, ProcessCoord(pipe=0, data=7, model=1): 15, ProcessCoord(pipe=0, data=8, model=0): 16, ProcessCoord(pipe=0, data=8, model=1): 17, ProcessCoord(pipe=0, data=9, model=0): 18, ProcessCoord(pipe=0, data=9, model=1): 19, ProcessCoord(pipe=0, data=10, model=0): 20, ProcessCoord(pipe=0, data=10, model=1): 21, ProcessCoord(pipe=0, data=11, model=0): 22, ProcessCoord(pipe=0, data=11, model=1): 23, ProcessCoord(pipe=0, data=12, model=0): 24, ProcessCoord(pipe=0, data=12, model=1): 25, ProcessCoord(pipe=0, data=13, model=0): 26, ProcessCoord(pipe=0, data=13, model=1): 27, ProcessCoord(pipe=0, data=14, model=0): 28, ProcessCoord(pipe=0, data=14, model=1): 29, ProcessCoord(pipe=0, data=15, model=0): 30, ProcessCoord(pipe=0, data=15, model=1): 31, ProcessCoord(pipe=1, data=0, model=0): 32, ProcessCoord(pipe=1, data=0, model=1): 33, ProcessCoord(pipe=1, data=1, model=0): 34, ProcessCoord(pipe=1, data=1, model=1): 35, ProcessCoord(pipe=1, data=2, model=0): 36, ProcessCoord(pipe=1, data=2, model=1): 37, ProcessCoord(pipe=1, data=3, model=0): 38, ProcessCoord(pipe=1, data=3, model=1): 39, ProcessCoord(pipe=1, data=4, model=0): 40, ProcessCoord(pipe=1, data=4, model=1): 41, ProcessCoord(pipe=1, data=5, model=0): 42, ProcessCoord(pipe=1, data=5, model=1): 43, ProcessCoord(pipe=1, data=6, model=0): 44, ProcessCoord(pipe=1, data=6, model=1): 45, ProcessCoord(pipe=1, data=7, model=0): 46, ProcessCoord(pipe=1, data=7, model=1): 47, ProcessCoord(pipe=1, data=8, model=0): 48, ProcessCoord(pipe=1, data=8, model=1): 49, ProcessCoord(pipe=1, data=9, model=0): 50, ProcessCoord(pipe=1, data=9, model=1): 51, ProcessCoord(pipe=1, data=10, model=0): 52, ProcessCoord(pipe=1, data=10, model=1): 53, ProcessCoord(pipe=1, data=11, model=0): 54, ProcessCoord(pipe=1, data=11, model=1): 55, ProcessCoord(pipe=1, data=12, model=0): 56, ProcessCoord(pipe=1, data=12, model=1): 57, ProcessCoord(pipe=1, data=13, model=0): 58, ProcessCoord(pipe=1, data=13, model=1): 59, ProcessCoord(pipe=1, data=14, model=0): 60, ProcessCoord(pipe=1, data=14, model=1): 61, ProcessCoord(pipe=1, data=15, model=0): 62, ProcessCoord(pipe=1, data=15, model=1): 63} +[default0]:[2022-10-06 13:05:02,270] [INFO] [module.py:366:_partition_layers] Partitioning pipeline stages with method type:transformer|embedding +[default0]:stage=0 layers=15 +[default0]: 0: _to_float16 +[default0]: 1: EmbeddingPipe +[default0]: 2: +[default0]: 3: ParallelTransformerLayerPipe +[default0]: 4: ParallelTransformerLayerPipe +[default0]: 5: ParallelTransformerLayerPipe +[default0]: 6: ParallelTransformerLayerPipe +[default0]: 7: ParallelTransformerLayerPipe +[default0]: 8: ParallelTransformerLayerPipe +[default0]: 9: ParallelTransformerLayerPipe +[default0]: 10: ParallelTransformerLayerPipe +[default0]: 11: ParallelTransformerLayerPipe +[default0]: 12: ParallelTransformerLayerPipe +[default0]: 13: ParallelTransformerLayerPipe +[default0]: 14: ParallelTransformerLayerPipe +[default0]:stage=1 layers=16 +[default0]: 15: ParallelTransformerLayerPipe +[default0]: 16: ParallelTransformerLayerPipe +[default0]: 17: ParallelTransformerLayerPipe +[default0]: 18: ParallelTransformerLayerPipe +[default0]: 19: ParallelTransformerLayerPipe +[default0]: 20: ParallelTransformerLayerPipe +[default0]: 21: ParallelTransformerLayerPipe +[default0]: 22: ParallelTransformerLayerPipe +[default0]: 23: ParallelTransformerLayerPipe +[default0]: 24: ParallelTransformerLayerPipe +[default0]: 25: ParallelTransformerLayerPipe +[default0]: 26: ParallelTransformerLayerPipe +[default0]: 27: undo +[default0]: 28: MixedFusedLayerNorm +[default0]: 29: EmbeddingPipe +[default0]: 30: float16_to_fp32 +[default0]: loss: CrossEntropy +[default0]:[2022-10-06 13:05:03,135] [INFO] [utils.py:827:see_memory_usage] After Building Model +[default0]:[2022-10-06 13:05:03,135] [INFO] [utils.py:828:see_memory_usage] MA 0.68 GB Max_MA 0.68 GB CA 0.71 GB Max_CA 1 GB +[default0]:[2022-10-06 13:05:03,135] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 33.46 GB, percent = 6.7% +[default0]:setting training iterations to 3100 +[default0]:> learning rate decay style: constant +[default0]:DeepSpeed is enabled. +[default0]:[2022-10-06 13:05:03,137] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed info: version=0.7.1+8b2a6371, git-hash=8b2a6371, git-branch=master +[default7]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default3]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default3]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default3]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default1]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default1]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default2]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default0]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default6]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default4]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default5]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default3]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default7]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default4]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default3]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default1]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default2]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default0]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default6]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default3]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default4]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default2]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default0]:[2022-10-06 13:05:04,094] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed Flops Profiler Enabled: False +[default7]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default5]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default0]:[2022-10-06 13:05:04,094] [INFO] [logging.py:68:log_dist] [Rank 0] Removing param_group that has no 'params' in the client Optimizer +[default6]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default1]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default0]:[2022-10-06 13:05:04,094] [INFO] [logging.py:68:log_dist] [Rank 0] Using client Optimizer as basic optimizer +[default0]:[2022-10-06 13:05:04,097] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed Basic Optimizer = {basic_optimizer.__class__.__name__} +[default0]:[2022-10-06 13:05:04,097] [INFO] [utils.py:52:is_zero_supported_optimizer] Checking ZeRO support for optimizer=FusedAdam type= +[default0]:[2022-10-06 13:05:04,097] [INFO] [logging.py:68:log_dist] [Rank 0] Creating fp16 ZeRO stage 1 optimizer +[default0]:[2022-10-06 13:05:04,097] [INFO] [stage_1_and_2.py:134:__init__] Reduce bucket size 500000000 +[default0]:[2022-10-06 13:05:04,097] [INFO] [stage_1_and_2.py:135:__init__] Allgather bucket size 500000000 +[default0]:[2022-10-06 13:05:04,097] [INFO] [stage_1_and_2.py:136:__init__] CPU Offload: False +[default0]:[2022-10-06 13:05:04,097] [INFO] [stage_1_and_2.py:137:__init__] Round robin gradient partitioning: False +[default0]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default7]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default5]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default3]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default1]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default2]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default0]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default6]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default7]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default4]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default4]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default2]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default7]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default5]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default1]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default5]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default4]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default5]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default7]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default1]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default6]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default2]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default5]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default0]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default2]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default6]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default4]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default6]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default0]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default0]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default0]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default1]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default3]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default5]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default4]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default2]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default7]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default6]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default5]:Emitting ninja build file /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113/utils/build.ninja... +[default5]:Building extension module utils... +[default5]:Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) +[default4]:Loading extension module utils... +[default4]:Time to load utils op: 0.4378488063812256 seconds +[default3]:Loading extension module utils... +[default3]:Time to load utils op: 0.43268513679504395 seconds +[default2]:Loading extension module utils... +[default2]:Time to load utils op: 0.43784642219543457 seconds +[default0]:Loading extension module utils... +[default0]:Time to load utils op: 0.4378504753112793 seconds +[default1]:Loading extension module utils... +[default1]:Time to load utils op: 0.4328937530517578 seconds +[default6]:Loading extension module utils... +[default6]:Time to load utils op: 0.437835693359375 seconds +[default2]:Loading extension module utils... +[default2]:Time to load utils op: 0.46538734436035156 seconds +[default3]:Loading extension module utils... +[default3]:Time to load utils op: 0.4914696216583252 seconds +[default4]:Loading extension module utils... +[default4]:Time to load utils op: 0.46375393867492676 seconds +[default6]:Loading extension module utils... +[default6]:Time to load utils op: 0.46388959884643555 seconds +[default1]:Loading extension module utils... +[default1]:Time to load utils op: 0.497180700302124 seconds +[default7]:Loading extension module utils... +[default7]:Time to load utils op: 0.49535083770751953 seconds +[default0]:Loading extension module utils... +[default0]:Time to load utils op: 0.4659585952758789 seconds +[default5]:Loading extension module utils... +[default5]:Time to load utils op: 0.4963531494140625 seconds +[default7]:Loading extension module utils... +[default7]:Time to load utils op: 0.43336033821105957 seconds +[default5]:Loading extension module utils... +[default5]:Time to load utils op: 0.43279528617858887 seconds +[default2]:Loading extension module utils... +[default2]:Time to load utils op: 0.44163990020751953 seconds +[default0]:Loading extension module utils... +[default0]:Time to load utils op: 0.4416210651397705 seconds +[default1]:Loading extension module utils... +[default1]:Time to load utils op: 0.4337272644042969 seconds +[default6]:Loading extension module utils... +[default6]:Time to load utils op: 0.4416530132293701 seconds +[default3]:Loading extension module utils... +[default3]:Time to load utils op: 0.43307995796203613 seconds +[default7]:Loading extension module utils... +[default7]:Time to load utils op: 0.4333505630493164 seconds +[default4]:Loading extension module utils... +[default4]:Time to load utils op: 0.4416189193725586 seconds +[default4]:Loading extension module utils... +[default4]:Time to load utils op: 0.46250104904174805 seconds +[default7]:Loading extension module utils... +[default7]:Time to load utils op: 0.4895472526550293 seconds +[default5]:Loading extension module utils... +[default5]:Time to load utils op: 0.4895603656768799 seconds +[default1]:Loading extension module utils... +[default1]:Time to load utils op: 0.48953914642333984 seconds +[default2]:Loading extension module utils... +[default2]:Time to load utils op: 0.4625406265258789 seconds +[default5]:Loading extension module utils... +[default5]:Time to load utils op: 0.4926772117614746 seconds +[default4]:Loading extension module utils... +[default4]:Time to load utils op: 0.45929718017578125 seconds +[default5]:ninja: no work to do. +[default5]:Loading extension module utils... +[default5]:Time to load utils op: 0.4393906593322754 seconds +[default6]:Loading extension module utils... +[default6]:Time to load utils op: 0.46283984184265137 seconds +[default7]:Loading extension module utils... +[default7]:Time to load utils op: 0.46659111976623535 seconds +[default1]:Loading extension module utils... +[default1]:Time to load utils op: 0.46645379066467285 seconds +[default2]:Loading extension module utils... +[default2]:Time to load utils op: 0.44792604446411133 seconds +[default5]:Loading extension module utils... +[default5]:Time to load utils op: 0.4335014820098877 seconds +[default2]:Loading extension module utils... +[default2]:Time to load utils op: 0.4611971378326416 seconds +[default0]:Loading extension module utils... +[default0]:Time to load utils op: 0.4479060173034668 seconds +[default7]:Loading extension module utils... +[default7]:Time to load utils op: 0.49222540855407715 seconds +[default6]:Loading extension module utils... +[default6]:Time to load utils op: 0.4608042240142822 seconds +[default6]:Loading extension module utils... +[default6]:Time to load utils op: 0.4472367763519287 seconds +[default4]:Loading extension module utils... +[default3]:Loading extension module utils... +[default3]:Time to load utils op: 0.4664421081542969 seconds +[default4]:Time to load utils op: 0.44842004776000977 seconds +[default0]:Loading extension module utils... +[default0]:Time to load utils op: 0.46251869201660156 seconds +[default3]:Loading extension module utils... +[default3]:Time to load utils op: 0.4895608425140381 seconds +[default0]:Loading extension module utils... +[default0]:Time to load utils op: 0.4608430862426758 seconds +[default3]:Loading extension module utils... +[default1]:Loading extension module utils... +[default1]:Time to load utils op: 0.491351842880249 seconds +[default3]:Time to load utils op: 0.49219298362731934 seconds +[default0]:Loading extension module utils... +[default0]:Time to load utils op: 0.44951319694519043 seconds +[default1]:Loading extension module utils... +[default1]:Time to load utils op: 0.4432101249694824 seconds +[default3]:Loading extension module utils... +[default3]:Time to load utils op: 0.43891334533691406 seconds +[default5]:Loading extension module utils... +[default5]:Time to load utils op: 0.4417545795440674 seconds +[default4]:Loading extension module utils... +[default4]:Time to load utils op: 0.4495229721069336 seconds +[default2]:Loading extension module utils... +[default7]:Loading extension module utils... +[default7]:Time to load utils op: 0.4430422782897949 seconds +[default2]:Time to load utils op: 0.4495065212249756 seconds +[default6]:Loading extension module utils... +[default6]:Time to load utils op: 0.44956111907958984 seconds +[default1]:Loading extension module utils... +[default1]:Time to load utils op: 0.44778919219970703 seconds +[default4]:Loading extension module utils... +[default4]:Time to load utils op: 0.45679259300231934 seconds +[default0]:Loading extension module utils... +[default0]:Time to load utils op: 0.45750904083251953 seconds +[default2]:Loading extension module utils... +[default2]:Time to load utils op: 0.45753026008605957 seconds +[default6]:Loading extension module utils... +[default3]:Loading extension module utils... +[default5]:Loading extension module utils... +[default6]:Time to load utils op: 0.4575324058532715 seconds +[default3]:Time to load utils op: 0.44780850410461426 seconds +[default5]:Time to load utils op: 0.4478776454925537 seconds +[default7]:Loading extension module utils... +[default7]:Time to load utils op: 0.4483664035797119 seconds +[default6]:Rank: 62 partition count [16, 16, 16] and sizes[(17055744, False), (5603328, False), (11136, False)] +[default7]:Rank: 63 partition count [16, 16, 16] and sizes[(17055744, False), (5603328, False), (11136, False)] +[default4]:Rank: 4 partition count [16, 16, 16] and sizes[(17055744, False), (5603328, False), (11136, False)] +[default7]:Rank: 7 partition count [16, 16, 16] and sizes[(17055744, False), (5603328, False), (11136, False)] +[default5]:Rank: 5 partition count [16, 16, 16] and sizes[(17055744, False), (5603328, False), (11136, False)] +[default2]:Rank: 42 partition count [16, 16, 16] and sizes[(17055744, False), (5603328, False), (11136, False)] +[default4]:Rank: 52 partition count [16, 16, 16] and sizes[(17055744, False), (5603328, False), (11136, False)] +[default1]:Rank: 49 partition count [16, 16, 16] and sizes[(17055744, False), (5603328, False), (11136, False)] +[default0]:Rank: 48 partition count [16, 16, 16] and sizes[(17055744, False), (5603328, False), (11136, False)] +[default6]:Rank: 6 partition count [16, 16, 16] and sizes[(17055744, False), (5603328, False), (11136, False)] +[default5]:Rank: 53 partition count [16, 16, 16] and sizes[(17055744, False), (5603328, False), (11136, False)] +[default3]:Rank: 43 partition count [16, 16, 16] and sizes[(17055744, False), (5603328, False), (11136, False)] +[default2]:Rank: 18 partition count [16, 16, 16] and sizes[(17055744, False), (5603328, False), (11136, False)] +[default2]:Rank: 58 partition count [16, 16, 16] and sizes[(17055744, False), (5603328, False), (11136, False)] +[default3]:Rank: 59 partition count [16, 16, 16] and sizes[(17055744, False), (5603328, False), (11136, False)] +[default6]:Rank: 54 partition count [16, 16, 16] and sizes[(17055744, False), (5603328, False), (11136, False)] +[default7]:Rank: 55 partition count [16, 16, 16] and sizes[(17055744, False), (5603328, False), (11136, False)] +[default2]:Rank: 10 partition count [16, 16, 16] and sizes[(17055744, False), (5603328, False), (11136, False)] +[default6]:Rank: 22 partition count [16, 16, 16] and sizes[(17055744, False), (5603328, False), (11136, False)] +[default0]:Rank: 8 partition count [16, 16, 16] and sizes[(17055744, False), (5603328, False), (11136, False)] +[default0]:Rank: 24 partition count [16, 16, 16] and sizes[(17055744, False), (5603328, False), (11136, False)] +[default2]:Rank: 34 partition count [16, 16, 16] and sizes[(17055744, False), (5603328, False), (11136, False)] +[default1]:Rank: 41 partition count [16, 16, 16] and sizes[(17055744, False), (5603328, False), (11136, False)] +[default0]:Rank: 40 partition count [16, 16, 16] and sizes[(17055744, False), (5603328, False), (11136, False)] +[default1]:Rank: 9 partition count [16, 16, 16] and sizes[(17055744, False), (5603328, False), (11136, False)] +[default7]:Rank: 23 partition count [16, 16, 16] and sizes[(17055744, False), (5603328, False), (11136, False)] +[default5]:Rank: 21 partition count [16, 16, 16] and sizes[(17055744, False), (5603328, False), (11136, False)] +[default4]:Rank: 20 partition count [16, 16, 16] and sizes[(17055744, False), (5603328, False), (11136, False)] +[default3]:Rank: 19 partition count [16, 16, 16] and sizes[(17055744, False), (5603328, False), (11136, False)] +[default3]:Rank: 11 partition count [16, 16, 16] and sizes[(17055744, False), (5603328, False), (11136, False)] +[default1]:Rank: 33 partition count [16, 16, 16] and sizes[(17055744, False), (5603328, False), (11136, False)] +[default4]:Rank: 12 partition count [16, 16, 16] and sizes[(17055744, False), (5603328, False), (11136, False)] +[default7]:Rank: 15 partition count [16, 16, 16] and sizes[(17055744, False), (5603328, False), (11136, False)] +[default5]:Rank: 13 partition count [16, 16, 16] and sizes[(17055744, False), (5603328, False), (11136, False)] +[default5]:Rank: 29 partition count [16, 16, 16] and sizes[(17055744, False), (5603328, False), (11136, False)] +[default6]:Rank: 14 partition count [16, 16, 16] and sizes[(17055744, False), (5603328, False), (11136, False)] +[default1]:Rank: 17 partition count [16, 16, 16] and sizes[(17055744, False), (5603328, False), (11136, False)] +[default2]:Rank: 26 partition count [16, 16, 16] and sizes[(17055744, False), (5603328, False), (11136, False)] +[default0]:Rank: 16 partition count [16, 16, 16] and sizes[(17055744, False), (5603328, False), (11136, False)] +[default1]:Rank: 25 partition count [16, 16, 16] and sizes[(17055744, False), (5603328, False), (11136, False)] +[default3]:Rank: 27 partition count [16, 16, 16] and sizes[(17055744, False), (5603328, False), (11136, False)] +[default0]:Rank: 32 partition count [16, 16, 16] and sizes[(17055744, False), (5603328, False), (11136, False)] +[default3]:Rank: 35 partition count [16, 16, 16] and sizes[(17055744, False), (5603328, False), (11136, False)] +[default5]:Rank: 37 partition count [16, 16, 16] and sizes[(17055744, False), (5603328, False), (11136, False)] +[default7]:Rank: 39 partition count [16, 16, 16] and sizes[(17055744, False), (5603328, False), (11136, False)] +[default6]:Rank: 38 partition count [16, 16, 16] and sizes[(17055744, False), (5603328, False), (11136, False)] +[default4]:Rank: 28 partition count [16, 16, 16] and sizes[(17055744, False), (5603328, False), (11136, False)] +[default6]:Rank: 30 partition count [16, 16, 16] and sizes[(17055744, False), (5603328, False), (11136, False)] +[default7]:Rank: 31 partition count [16, 16, 16] and sizes[(17055744, False), (5603328, False), (11136, False)] +[default4]:Rank: 36 partition count [16, 16, 16] and sizes[(17055744, False), (5603328, False), (11136, False)] +[default6]:Rank: 46 partition count [16, 16, 16] and sizes[(17055744, False), (5603328, False), (11136, False)] +[default7]:Rank: 47 partition count [16, 16, 16] and sizes[(17055744, False), (5603328, False), (11136, False)] +[default5]:Rank: 61 partition count [16, 16, 16] and sizes[(17055744, False), (5603328, False), (11136, False)] +[default4]:Rank: 60 partition count [16, 16, 16] and sizes[(17055744, False), (5603328, False), (11136, False)] +[default2]:Rank: 50 partition count [16, 16, 16] and sizes[(17055744, False), (5603328, False), (11136, False)] +[default3]:Rank: 51 partition count [16, 16, 16] and sizes[(17055744, False), (5603328, False), (11136, False)] +[default1]:Rank: 57 partition count [16, 16, 16] and sizes[(17055744, False), (5603328, False), (11136, False)] +[default0]:Rank: 56 partition count [16, 16, 16] and sizes[(17055744, False), (5603328, False), (11136, False)] +[default3]:Rank: 3 partition count [16, 16, 16] and sizes[(17055744, False), (5603328, False), (11136, False)] +[default2]:Rank: 2 partition count [16, 16, 16] and sizes[(17055744, False), (5603328, False), (11136, False)] +[default0]:Rank: 0 partition count [16, 16, 16] and sizes[(17055744, False), (5603328, False), (11136, False)] +[default1]:Rank: 1 partition count [16, 16, 16] and sizes[(17055744, False), (5603328, False), (11136, False)] +[default0]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default0]:No modifications detected for re-loaded extension module utils, skipping build step... +[default0]:Loading extension module utils... +[default0]:Time to load utils op: 0.0022711753845214844 seconds +[default5]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default5]:No modifications detected for re-loaded extension module utils, skipping build step... +[default5]:Loading extension module utils... +[default5]:Time to load utils op: 0.0029451847076416016 seconds +[default4]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default4]:No modifications detected for re-loaded extension module utils, skipping build step... +[default4]:Loading extension module utils... +[default4]:Time to load utils op: 0.0016644001007080078 seconds +[default1]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default1]:No modifications detected for re-loaded extension module utils, skipping build step... +[default1]:Loading extension module utils... +[default1]:Time to load utils op: 0.001382589340209961 seconds +[default6]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default6]:No modifications detected for re-loaded extension module utils, skipping build step... +[default6]:Loading extension module utils... +[default6]:Time to load utils op: 0.001458883285522461 seconds +[default2]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default2]:No modifications detected for re-loaded extension module utils, skipping build step... +[default2]:Loading extension module utils... +[default3]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default3]:No modifications detected for re-loaded extension module utils, skipping build step... +[default3]:Loading extension module utils... +[default3]:Time to load utils op: 0.001863718032836914 seconds +[default2]:Time to load utils op: 0.0014789104461669922 seconds +[default7]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default7]:No modifications detected for re-loaded extension module utils, skipping build step... +[default7]:Loading extension module utils... +[default7]:Time to load utils op: 0.001445770263671875 seconds +[default2]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default2]:No modifications detected for re-loaded extension module utils, skipping build step... +[default2]:Loading extension module utils... +[default2]:Time to load utils op: 0.0021657943725585938 seconds +[default0]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default0]:No modifications detected for re-loaded extension module utils, skipping build step... +[default0]:Loading extension module utils... +[default0]:Time to load utils op: 0.00119781494140625 seconds +[default1]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default1]:No modifications detected for re-loaded extension module utils, skipping build step... +[default1]:Loading extension module utils... +[default1]:Time to load utils op: 0.0010962486267089844 seconds +[default4]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default4]:No modifications detected for re-loaded extension module utils, skipping build step... +[default4]:Loading extension module utils... +[default4]:Time to load utils op: 0.0013744831085205078 seconds +[default3]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default3]:No modifications detected for re-loaded extension module utils, skipping build step... +[default3]:Loading extension module utils... +[default3]:Time to load utils op: 0.0021965503692626953 seconds +[default6]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default6]:No modifications detected for re-loaded extension module utils, skipping build step... +[default6]:Loading extension module utils... +[default6]:Time to load utils op: 0.002420663833618164 seconds +[default2]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default2]:No modifications detected for re-loaded extension module utils, skipping build step... +[default2]:Loading extension module utils... +[default7]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default7]:No modifications detected for re-loaded extension module utils, skipping build step... +[default2]:Time to load utils op: 0.002086162567138672 seconds +[default7]:Loading extension module utils... +[default7]:Time to load utils op: 0.001947641372680664 seconds +[default3]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default3]:No modifications detected for re-loaded extension module utils, skipping build step... +[default3]:Loading extension module utils... +[default3]:Time to load utils op: 0.0019888877868652344 seconds +[default4]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default4]:No modifications detected for re-loaded extension module utils, skipping build step... +[default4]:Loading extension module utils... +[default4]:Time to load utils op: 0.002506256103515625 seconds +[default6]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default5]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default5]:No modifications detected for re-loaded extension module utils, skipping build step... +[default5]:Loading extension module utils... +[default6]:No modifications detected for re-loaded extension module utils, skipping build step... +[default6]:Loading extension module utils... +[default6]:Time to load utils op: 0.0027861595153808594 seconds +[default5]:Time to load utils op: 0.0027205944061279297 seconds +[default1]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default1]:No modifications detected for re-loaded extension module utils, skipping build step... +[default1]:Loading extension module utils... +[default1]:Time to load utils op: 0.002234935760498047 seconds +[default7]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default7]:No modifications detected for re-loaded extension module utils, skipping build step... +[default7]:Loading extension module utils... +[default7]:Time to load utils op: 0.00212860107421875 seconds +[default5]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default5]:No modifications detected for re-loaded extension module utils, skipping build step... +[default5]:Loading extension module utils... +[default5]:Time to load utils op: 0.0012865066528320312 seconds +[default2]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default2]:No modifications detected for re-loaded extension module utils, skipping build step... +[default2]:Loading extension module utils... +[default2]:Time to load utils op: 0.0014379024505615234 seconds +[default1]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default1]:No modifications detected for re-loaded extension module utils, skipping build step... +[default1]:Loading extension module utils... +[default1]:Time to load utils op: 0.0013561248779296875 seconds +[default7]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default7]:No modifications detected for re-loaded extension module utils, skipping build step... +[default7]:Loading extension module utils... +[default7]:Time to load utils op: 0.0015730857849121094 seconds +[default3]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default3]:No modifications detected for re-loaded extension module utils, skipping build step... +[default3]:Loading extension module utils... +[default3]:Time to load utils op: 0.0014679431915283203 seconds +[default0]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default0]:No modifications detected for re-loaded extension module utils, skipping build step... +[default6]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default6]:No modifications detected for re-loaded extension module utils, skipping build step... +[default0]:Loading extension module utils... +[default0]:Time to load utils op: 0.0013725757598876953 seconds +[default6]:Loading extension module utils... +[default6]:Time to load utils op: 0.0021631717681884766 seconds +[default4]:Rank: 44 partition count [16, 16, 16] and sizes[(17055744, False), (5603328, False), (11136, False)] +[default4]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default4]:No modifications detected for re-loaded extension module utils, skipping build step... +[default4]:Loading extension module utils... +[default4]:Time to load utils op: 0.0016863346099853516 seconds +[default4]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default4]:No modifications detected for re-loaded extension module utils, skipping build step... +[default4]:Loading extension module utils... +[default4]:Time to load utils op: 0.0014622211456298828 seconds +[default1]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default1]:No modifications detected for re-loaded extension module utils, skipping build step... +[default7]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default7]:No modifications detected for re-loaded extension module utils, skipping build step... +[default7]:Loading extension module utils... +[default7]:Time to load utils op: 0.0014071464538574219 seconds +[default1]:Loading extension module utils... +[default1]:Time to load utils op: 0.0018663406372070312 seconds +[default2]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default2]:No modifications detected for re-loaded extension module utils, skipping build step... +[default2]:Loading extension module utils... +[default2]:Time to load utils op: 0.0015134811401367188 seconds +[default5]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default5]:No modifications detected for re-loaded extension module utils, skipping build step... +[default5]:Loading extension module utils... +[default5]:Time to load utils op: 0.0023801326751708984 seconds +[default5]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default5]:No modifications detected for re-loaded extension module utils, skipping build step... +[default5]:Loading extension module utils... +[default5]:Time to load utils op: 0.0025167465209960938 seconds +[default4]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default4]:No modifications detected for re-loaded extension module utils, skipping build step... +[default4]:Loading extension module utils... +[default4]:Time to load utils op: 0.0013666152954101562 seconds +[default6]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default6]:No modifications detected for re-loaded extension module utils, skipping build step... +[default6]:Loading extension module utils... +[default6]:Time to load utils op: 0.0014896392822265625 seconds +[default1]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default1]:No modifications detected for re-loaded extension module utils, skipping build step... +[default1]:Loading extension module utils... +[default1]:Time to load utils op: 0.0020885467529296875 seconds +[default7]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default7]:No modifications detected for re-loaded extension module utils, skipping build step... +[default7]:Loading extension module utils... +[default7]:Time to load utils op: 0.0016875267028808594 seconds +[default5]:Rank: 45 partition count [16, 16, 16] and sizes[(17055744, False), (5603328, False), (11136, False)] +[default5]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default5]:No modifications detected for re-loaded extension module utils, skipping build step... +[default5]:Loading extension module utils... +[default5]:Time to load utils op: 0.0018851757049560547 seconds +[default2]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default2]:No modifications detected for re-loaded extension module utils, skipping build step... +[default2]:Loading extension module utils... +[default2]:Time to load utils op: 0.002053499221801758 seconds +[default5]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default5]:No modifications detected for re-loaded extension module utils, skipping build step... +[default5]:Loading extension module utils... +[default5]:Time to load utils op: 0.0016644001007080078 seconds +[default2]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default2]:No modifications detected for re-loaded extension module utils, skipping build step... +[default7]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default7]:No modifications detected for re-loaded extension module utils, skipping build step... +[default7]:Loading extension module utils... +[default7]:Time to load utils op: 0.0011930465698242188 seconds +[default2]:Loading extension module utils... +[default2]:Time to load utils op: 0.0013396739959716797 seconds +[default6]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default6]:No modifications detected for re-loaded extension module utils, skipping build step... +[default6]:Loading extension module utils... +[default6]:Time to load utils op: 0.0016205310821533203 seconds +[default0]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default4]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default4]:No modifications detected for re-loaded extension module utils, skipping build step... +[default4]:Loading extension module utils... +[default4]:Time to load utils op: 0.0016469955444335938 seconds +[default6]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default6]:No modifications detected for re-loaded extension module utils, skipping build step... +[default6]:Loading extension module utils... +[default0]:No modifications detected for re-loaded extension module utils, skipping build step... +[default6]:Time to load utils op: 0.0017168521881103516 seconds +[default0]:Loading extension module utils... +[default0]:Time to load utils op: 0.002779245376586914 seconds +[default3]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default3]:No modifications detected for re-loaded extension module utils, skipping build step... +[default3]:Loading extension module utils... +[default3]:Time to load utils op: 0.0016961097717285156 seconds +[default0]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default0]:No modifications detected for re-loaded extension module utils, skipping build step... +[default0]:Loading extension module utils... +[default0]:Time to load utils op: 0.0015742778778076172 seconds +[default3]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default3]:No modifications detected for re-loaded extension module utils, skipping build step... +[default3]:Loading extension module utils... +[default3]:Time to load utils op: 0.0025446414947509766 seconds +[default3]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default3]:No modifications detected for re-loaded extension module utils, skipping build step... +[default3]:Loading extension module utils... +[default3]:Time to load utils op: 0.0013625621795654297 seconds +[default0]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default0]:No modifications detected for re-loaded extension module utils, skipping build step... +[default0]:Loading extension module utils... +[default1]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default1]:No modifications detected for re-loaded extension module utils, skipping build step... +[default1]:Loading extension module utils... +[default1]:Time to load utils op: 0.0020532608032226562 seconds +[default0]:Time to load utils op: 0.0014142990112304688 seconds +[default0]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default0]:No modifications detected for re-loaded extension module utils, skipping build step... +[default0]:Loading extension module utils... +[default0]:Time to load utils op: 0.0008084774017333984 seconds +[default1]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default1]:No modifications detected for re-loaded extension module utils, skipping build step... +[default1]:Loading extension module utils... +[default1]:Time to load utils op: 0.0012717247009277344 seconds +[default3]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default3]:No modifications detected for re-loaded extension module utils, skipping build step... +[default3]:Loading extension module utils... +[default3]:Time to load utils op: 0.0008635520935058594 seconds +[default5]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default5]:No modifications detected for re-loaded extension module utils, skipping build step... +[default5]:Loading extension module utils... +[default5]:Time to load utils op: 0.0013225078582763672 seconds +[default4]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default4]:No modifications detected for re-loaded extension module utils, skipping build step... +[default4]:Loading extension module utils... +[default4]:Time to load utils op: 0.0008885860443115234 seconds +[default2]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default7]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default2]:No modifications detected for re-loaded extension module utils, skipping build step... +[default2]:Loading extension module utils... +[default2]:Time to load utils op: 0.0009267330169677734 seconds +[default7]:No modifications detected for re-loaded extension module utils, skipping build step... +[default7]:Loading extension module utils... +[default7]:Time to load utils op: 0.0016226768493652344 seconds +[default6]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default6]:No modifications detected for re-loaded extension module utils, skipping build step... +[default6]:Loading extension module utils... +[default6]:Time to load utils op: 0.0009582042694091797 seconds +[default0]:[2022-10-06 13:05:07,288] [INFO] [utils.py:827:see_memory_usage] Before initializing optimizer states +[default0]:[2022-10-06 13:05:07,289] [INFO] [utils.py:828:see_memory_usage] MA 0.76 GB Max_MA 0.77 GB CA 1.22 GB Max_CA 1 GB +[default0]:[2022-10-06 13:05:07,289] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 35.64 GB, percent = 7.1% +[default0]:[2022-10-06 13:05:07,354] [INFO] [utils.py:827:see_memory_usage] After initializing optimizer states +[default0]:[2022-10-06 13:05:07,355] [INFO] [utils.py:828:see_memory_usage] MA 0.93 GB Max_MA 1.02 GB CA 1.39 GB Max_CA 1 GB +[default0]:[2022-10-06 13:05:07,355] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 35.64 GB, percent = 7.1% +[default0]:[2022-10-06 13:05:07,355] [INFO] [stage_1_and_2.py:516:__init__] optimizer state initialized +[default0]:[2022-10-06 13:05:07,383] [INFO] [utils.py:827:see_memory_usage] After initializing ZeRO optimizer +[default0]:[2022-10-06 13:05:07,383] [INFO] [utils.py:828:see_memory_usage] MA 0.93 GB Max_MA 0.93 GB CA 1.39 GB Max_CA 1 GB +[default0]:[2022-10-06 13:05:07,383] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 35.64 GB, percent = 7.1% +[default0]:[2022-10-06 13:05:07,384] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed Final Optimizer = FusedAdam +[default0]:[2022-10-06 13:05:07,384] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed using client LR scheduler +[default0]:[2022-10-06 13:05:07,384] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed LR Scheduler = +[default0]:[2022-10-06 13:05:07,384] [INFO] [logging.py:68:log_dist] [Rank 0] step=0, skipped=0, lr=[2e-05, 2e-05, 2e-05], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)] +[default0]:[2022-10-06 13:05:07,384] [INFO] [config.py:987:print] DeepSpeedEngine configuration: +[default0]:[2022-10-06 13:05:07,384] [INFO] [config.py:991:print] activation_checkpointing_config { +[default0]: "partition_activations": false, +[default0]: "contiguous_memory_optimization": false, +[default0]: "cpu_checkpointing": false, +[default0]: "number_checkpoints": null, +[default0]: "synchronize_checkpoint_boundary": false, +[default0]: "profile": false +[default0]:} +[default0]:[2022-10-06 13:05:07,384] [INFO] [config.py:991:print] aio_config ................... {'block_size': 1048576, 'queue_depth': 8, 'thread_count': 1, 'single_submit': False, 'overlap_events': True} +[default0]:[2022-10-06 13:05:07,384] [INFO] [config.py:991:print] amp_enabled .................. False +[default0]:[2022-10-06 13:05:07,384] [INFO] [config.py:991:print] amp_params ................... False +[default0]:[2022-10-06 13:05:07,384] [INFO] [config.py:991:print] autotuning_config ............ { +[default0]: "enabled": false, +[default0]: "start_step": null, +[default0]: "end_step": null, +[default0]: "metric_path": null, +[default0]: "arg_mappings": null, +[default0]: "metric": "throughput", +[default0]: "model_info": null, +[default0]: "results_dir": null, +[default0]: "exps_dir": null, +[default0]: "overwrite": true, +[default0]: "fast": true, +[default0]: "start_profile_step": 3, +[default0]: "end_profile_step": 5, +[default0]: "tuner_type": "gridsearch", +[default0]: "tuner_early_stopping": 5, +[default0]: "tuner_num_trials": 50, +[default0]: "model_info_path": null, +[default0]: "mp_size": 1, +[default0]: "max_train_batch_size": null, +[default0]: "min_train_batch_size": 1, +[default0]: "max_train_micro_batch_size_per_gpu": 1.024000e+03, +[default0]: "min_train_micro_batch_size_per_gpu": 1, +[default0]: "num_tuning_micro_batch_sizes": 3 +[default0]:} +[default0]:[2022-10-06 13:05:07,385] [INFO] [config.py:991:print] bfloat16_enabled ............. False +[default0]:[2022-10-06 13:05:07,385] [INFO] [config.py:991:print] checkpoint_tag_validation_enabled True +[default0]:[2022-10-06 13:05:07,385] [INFO] [config.py:991:print] checkpoint_tag_validation_fail False +[default0]:[2022-10-06 13:05:07,385] [INFO] [config.py:991:print] comms_config ................. +[default0]:[2022-10-06 13:05:07,385] [INFO] [config.py:991:print] communication_data_type ...... None +[default0]:[2022-10-06 13:05:07,385] [INFO] [config.py:991:print] compression_config ........... {'weight_quantization': {'shared_parameters': {'enabled': False, 'quantizer_kernel': False, 'schedule_offset': 0, 'quantize_groups': 1, 'quantize_verbose': False, 'quantization_type': 'symmetric', 'quantize_weight_in_forward': False, 'rounding': 'nearest', 'fp16_mixed_quantize': False, 'quantize_change_ratio': 0.001}, 'different_groups': {}}, 'activation_quantization': {'shared_parameters': {'enabled': False, 'quantization_type': 'symmetric', 'range_calibration': 'dynamic', 'schedule_offset': 1000}, 'different_groups': {}}, 'sparse_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'row_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'head_pruning': {'shared_parameters': {'enabled': False, 'method': 'topk', 'schedule_offset': 1000}, 'different_groups': {}}, 'channel_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'layer_reduction': {'enabled': False}} +[default0]:[2022-10-06 13:05:07,385] [INFO] [config.py:991:print] curriculum_enabled ........... False +[default0]:[2022-10-06 13:05:07,385] [INFO] [config.py:991:print] curriculum_params ............ False +[default0]:[2022-10-06 13:05:07,385] [INFO] [config.py:991:print] dataloader_drop_last ......... False +[default0]:[2022-10-06 13:05:07,385] [INFO] [config.py:991:print] disable_allgather ............ False +[default0]:[2022-10-06 13:05:07,385] [INFO] [config.py:991:print] dump_state ................... False +[default0]:[2022-10-06 13:05:07,385] [INFO] [config.py:991:print] dynamic_loss_scale_args ...... {'init_scale': 4096, 'scale_window': 500, 'delayed_shift': 2, 'min_scale': 1} +[default0]:[2022-10-06 13:05:07,385] [INFO] [config.py:991:print] eigenvalue_enabled ........... False +[default0]:[2022-10-06 13:05:07,385] [INFO] [config.py:991:print] eigenvalue_gas_boundary_resolution 1 +[default0]:[2022-10-06 13:05:07,385] [INFO] [config.py:991:print] eigenvalue_layer_name ........ bert.encoder.layer +[default0]:[2022-10-06 13:05:07,385] [INFO] [config.py:991:print] eigenvalue_layer_num ......... 0 +[default0]:[2022-10-06 13:05:07,385] [INFO] [config.py:991:print] eigenvalue_max_iter .......... 100 +[default0]:[2022-10-06 13:05:07,385] [INFO] [config.py:991:print] eigenvalue_stability ......... 1e-06 +[default0]:[2022-10-06 13:05:07,385] [INFO] [config.py:991:print] eigenvalue_tol ............... 0.01 +[default0]:[2022-10-06 13:05:07,385] [INFO] [config.py:991:print] eigenvalue_verbose ........... False +[default0]:[2022-10-06 13:05:07,385] [INFO] [config.py:991:print] elasticity_enabled ........... False +[default0]:[2022-10-06 13:05:07,385] [INFO] [config.py:991:print] flops_profiler_config ........ { +[default0]: "enabled": false, +[default0]: "profile_step": 1, +[default0]: "module_depth": -1, +[default0]: "top_modules": 1, +[default0]: "detailed": true, +[default0]: "output_file": null +[default0]:} +[default0]:[2022-10-06 13:05:07,385] [INFO] [config.py:991:print] fp16_auto_cast ............... False +[default0]:[2022-10-06 13:05:07,385] [INFO] [config.py:991:print] fp16_enabled ................. True +[default0]:[2022-10-06 13:05:07,385] [INFO] [config.py:991:print] fp16_master_weights_and_gradients False +[default0]:[2022-10-06 13:05:07,385] [INFO] [config.py:991:print] global_rank .................. 0 +[default0]:[2022-10-06 13:05:07,385] [INFO] [config.py:991:print] gradient_accumulation_steps .. 128 +[default0]:[2022-10-06 13:05:07,385] [INFO] [config.py:991:print] gradient_clipping ............ 1.0 +[default0]:[2022-10-06 13:05:07,385] [INFO] [config.py:991:print] gradient_predivide_factor .... 1.0 +[default0]:[2022-10-06 13:05:07,385] [INFO] [config.py:991:print] initial_dynamic_scale ........ 4096 +[default0]:[2022-10-06 13:05:07,385] [INFO] [config.py:991:print] load_universal_checkpoint .... False +[default0]:[2022-10-06 13:05:07,385] [INFO] [config.py:991:print] loss_scale ................... 0 +[default0]:[2022-10-06 13:05:07,385] [INFO] [config.py:991:print] memory_breakdown ............. False +[default0]:[2022-10-06 13:05:07,385] [INFO] [config.py:991:print] monitor_config ............... +[default0]:[2022-10-06 13:05:07,385] [INFO] [config.py:991:print] nebula_config ................ { +[default0]: "enabled": false, +[default0]: "persistent_storage_path": null, +[default0]: "persistent_time_interval": 100, +[default0]: "num_of_version_in_retention": 2, +[default0]: "enable_nebula_load": true, +[default0]: "load_path": null +[default0]:} +[default0]:[2022-10-06 13:05:07,385] [INFO] [config.py:991:print] optimizer_legacy_fusion ...... False +[default0]:[2022-10-06 13:05:07,385] [INFO] [config.py:991:print] optimizer_name ............... None +[default0]:[2022-10-06 13:05:07,385] [INFO] [config.py:991:print] optimizer_params ............. None +[default0]:[2022-10-06 13:05:07,385] [INFO] [config.py:991:print] pipeline ..................... {'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0} +[default0]:[2022-10-06 13:05:07,385] [INFO] [config.py:991:print] pld_enabled .................. False +[default0]:[2022-10-06 13:05:07,385] [INFO] [config.py:991:print] pld_params ................... False +[default0]:[2022-10-06 13:05:07,385] [INFO] [config.py:991:print] prescale_gradients ........... False +[default0]:[2022-10-06 13:05:07,385] [INFO] [config.py:991:print] scheduler_name ............... None +[default0]:[2022-10-06 13:05:07,385] [INFO] [config.py:991:print] scheduler_params ............. None +[default0]:[2022-10-06 13:05:07,385] [INFO] [config.py:991:print] sparse_attention ............. None +[default0]:[2022-10-06 13:05:07,385] [INFO] [config.py:991:print] sparse_gradients_enabled ..... False +[default0]:[2022-10-06 13:05:07,385] [INFO] [config.py:991:print] steps_per_print .............. 2000 +[default0]:[2022-10-06 13:05:07,385] [INFO] [config.py:991:print] train_batch_size ............. 2048 +[default0]:[2022-10-06 13:05:07,385] [INFO] [config.py:991:print] train_micro_batch_size_per_gpu 1 +[default0]:[2022-10-06 13:05:07,385] [INFO] [config.py:991:print] wall_clock_breakdown ......... False +[default0]:[2022-10-06 13:05:07,385] [INFO] [config.py:991:print] world_size ................... 16 +[default0]:[2022-10-06 13:05:07,386] [INFO] [config.py:991:print] zero_allow_untested_optimizer False +[default0]:[2022-10-06 13:05:07,386] [INFO] [config.py:991:print] zero_config .................. stage=1 contiguous_gradients=True reduce_scatter=True reduce_bucket_size=500000000 allgather_partitions=True allgather_bucket_size=500000000 overlap_comm=False load_from_fp32_weights=True elastic_checkpoint=False offload_param=None offload_optimizer=None sub_group_size=1000000000 cpu_offload_param=None cpu_offload_use_pin_memory=None cpu_offload=None prefetch_bucket_size=50000000 param_persistence_threshold=100000 model_persistence_threshold=9223372036854775807 max_live_parameters=1000000000 max_reuse_distance=1000000000 gather_16bit_weights_on_model_save=False stage3_gather_fp16_weights_on_model_save=False ignore_unused_parameters=True legacy_stage1=False round_robin_gradients=False +[default0]:[2022-10-06 13:05:07,386] [INFO] [config.py:991:print] zero_enabled ................. True +[default0]:[2022-10-06 13:05:07,386] [INFO] [config.py:991:print] zero_optimization_stage ...... 1 +[default0]:[2022-10-06 13:05:07,386] [INFO] [config.py:976:print_user_config] json = { +[default0]: "train_micro_batch_size_per_gpu": 1, +[default0]: "train_batch_size": 2.048000e+03, +[default0]: "gradient_clipping": 1.0, +[default0]: "zero_optimization": { +[default0]: "stage": 1 +[default0]: }, +[default0]: "fp16": { +[default0]: "enabled": true, +[default0]: "loss_scale": 0, +[default0]: "loss_scale_window": 500, +[default0]: "hysteresis": 2, +[default0]: "min_loss_scale": 1, +[default0]: "initial_scale_power": 12 +[default0]: }, +[default0]: "steps_per_print": 2.000000e+03, +[default0]: "wall_clock_breakdown": false +[default0]:} +[default0]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default0]:No modifications detected for re-loaded extension module utils, skipping build step... +[default0]:Loading extension module utils... +[default0]:Time to load utils op: 0.0004589557647705078 seconds +[default0]:[2022-10-06 13:05:07,386] [INFO] [engine.py:87:__init__] CONFIG: micro_batches=128 micro_batch_size=1 +[default0]:[2022-10-06 13:05:07,878] [INFO] [engine.py:145:__init__] RANK=32 STAGE=1 LAYERS=16 [15, 31) STAGE_PARAMS=362723328 (362.723M) TOTAL_PARAMS=1450893312 (1450.893M) UNIQUE_PARAMS=1065541632 (1065.542M) +[default1]:[2022-10-06 13:05:07,880] [INFO] [engine.py:145:__init__] RANK=33 STAGE=1 LAYERS=16 [15, 31) STAGE_PARAMS=362723328 (362.723M) TOTAL_PARAMS=1450893312 (1450.893M) UNIQUE_PARAMS=1065541632 (1065.542M) +[default1]:[2022-10-06 13:05:07,878] [INFO] [engine.py:145:__init__] RANK=1 STAGE=0 LAYERS=15 [0, 15) STAGE_PARAMS=362723328 (362.723M) TOTAL_PARAMS=1450893312 (1450.893M) UNIQUE_PARAMS=1065541632 (1065.542M) +[default0]:[2022-10-06 13:05:07,878] [INFO] [engine.py:145:__init__] RANK=0 STAGE=0 LAYERS=15 [0, 15) STAGE_PARAMS=362723328 (362.723M) TOTAL_PARAMS=1450893312 (1450.893M) UNIQUE_PARAMS=1065541632 (1065.542M) +[default0]:[2022-10-06 13:05:08,050] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default5]:[2022-10-06 13:05:08,050] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default4]:[2022-10-06 13:05:08,050] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default1]:[2022-10-06 13:05:08,050] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default2]:[2022-10-06 13:05:08,050] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default6]:[2022-10-06 13:05:08,050] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default3]:[2022-10-06 13:05:08,050] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default7]:[2022-10-06 13:05:08,050] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default0]:[2022-10-06 13:05:08,070] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default4]:[2022-10-06 13:05:08,070] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default3]:[2022-10-06 13:05:08,070] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default1]:[2022-10-06 13:05:08,070] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default2]:[2022-10-06 13:05:08,070] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default6]:[2022-10-06 13:05:08,070] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default2]:[2022-10-06 13:05:08,051] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default7]:[2022-10-06 13:05:08,070] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default3]:[2022-10-06 13:05:08,051] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default4]:[2022-10-06 13:05:08,051] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default6]:[2022-10-06 13:05:08,050] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default5]:[2022-10-06 13:05:08,051] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default1]:[2022-10-06 13:05:08,050] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default7]:[2022-10-06 13:05:08,050] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default0]:[2022-10-06 13:05:08,050] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default5]:[2022-10-06 13:05:08,070] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default4]:[2022-10-06 13:05:08,070] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default2]:[2022-10-06 13:05:08,070] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default7]:[2022-10-06 13:05:08,070] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default1]:[2022-10-06 13:05:08,070] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default5]:[2022-10-06 13:05:08,070] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default6]:[2022-10-06 13:05:08,071] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default0]:[2022-10-06 13:05:08,070] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default3]:[2022-10-06 13:05:08,071] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default2]:[2022-10-06 13:05:08,151] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default6]:[2022-10-06 13:05:08,150] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default3]:[2022-10-06 13:05:08,151] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default0]:[2022-10-06 13:05:08,151] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default1]:[2022-10-06 13:05:08,151] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default7]:[2022-10-06 13:05:08,151] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default4]:[2022-10-06 13:05:08,151] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default5]:[2022-10-06 13:05:08,150] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default4]:[2022-10-06 13:05:08,150] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default5]:[2022-10-06 13:05:08,151] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default1]:[2022-10-06 13:05:08,150] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default7]:[2022-10-06 13:05:08,150] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default2]:[2022-10-06 13:05:08,151] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default5]:[2022-10-06 13:05:08,150] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default2]:[2022-10-06 13:05:08,150] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default7]:[2022-10-06 13:05:08,150] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default6]:[2022-10-06 13:05:08,150] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default6]:[2022-10-06 13:05:08,150] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default3]:[2022-10-06 13:05:08,150] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default4]:[2022-10-06 13:05:08,150] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default0]:[2022-10-06 13:05:08,150] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default0]:[2022-10-06 13:05:08,150] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default3]:[2022-10-06 13:05:08,150] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default1]:[2022-10-06 13:05:08,150] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default0]:[2022-10-06 13:05:08,124] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default1]:[2022-10-06 13:05:08,124] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default3]:[2022-10-06 13:05:08,124] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default5]:[2022-10-06 13:05:08,123] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default4]:[2022-10-06 13:05:08,124] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default2]:[2022-10-06 13:05:08,124] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default7]:[2022-10-06 13:05:08,123] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default6]:[2022-10-06 13:05:08,124] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default3]:Traceback (most recent call last): +[default3]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default3]: main() +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default3]: return f(*args, **kwargs) +[default3]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default3]: pretrain( +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default3]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer +[default3]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default3]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default3]: load_path, client_states = self._load_checkpoint(load_dir, +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint +[default3]: sd_loader = SDLoaderFactory.get_sd_loader( +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader +[default3]: return MegatronSDLoader(ckpt_list, version, checkpoint_engine) +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ +[default3]: super().__init__(ckpt_list, version, checkpoint_engine) +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ +[default3]: self.check_ckpt_list() +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list +[default3]: assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" +[default3]:AssertionError: checkpoint count 1 is different from saved mp_world_size 4 +[default3]:[2022-10-06 13:05:10,049] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default4]:[2022-10-06 13:05:10,071] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default6]:[2022-10-06 13:05:10,071] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default4]:Traceback (most recent call last): +[default4]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default4]: main() +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default4]: return f(*args, **kwargs) +[default4]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default4]: pretrain( +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default4]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer +[default4]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default4]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default4]: load_path, client_states = self._load_checkpoint(load_dir, +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint +[default4]: sd_loader = SDLoaderFactory.get_sd_loader( +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader +[default4]: return MegatronSDLoader(ckpt_list, version, checkpoint_engine) +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ +[default4]: super().__init__(ckpt_list, version, checkpoint_engine) +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ +[default4]: self.check_ckpt_list() +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list +[default4]: assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" +[default4]:AssertionError: checkpoint count 1 is different from saved mp_world_size 4 +[default6]:Traceback (most recent call last): +[default6]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default6]: main() +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default6]: return f(*args, **kwargs) +[default6]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default6]: pretrain( +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default6]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer +[default6]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default6]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default6]: load_path, client_states = self._load_checkpoint(load_dir, +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint +[default6]: sd_loader = SDLoaderFactory.get_sd_loader( +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader +[default6]: return MegatronSDLoader(ckpt_list, version, checkpoint_engine) +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ +[default6]: super().__init__(ckpt_list, version, checkpoint_engine) +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ +[default6]: self.check_ckpt_list() +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list +[default6]: assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" +[default6]:AssertionError: checkpoint count 1 is different from saved mp_world_size 4 +[default0]:Traceback (most recent call last): +[default0]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default0]: main() +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default0]: return f(*args, **kwargs) +[default0]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default0]: pretrain( +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default0]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer +[default0]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default1]:[2022-10-06 13:05:10,117] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default0]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default0]: load_path, client_states = self._load_checkpoint(load_dir, +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint +[default0]: sd_loader = SDLoaderFactory.get_sd_loader( +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader +[default0]: return MegatronSDLoader(ckpt_list, version, checkpoint_engine) +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ +[default0]: super().__init__(ckpt_list, version, checkpoint_engine) +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ +[default0]: self.check_ckpt_list() +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list +[default0]: assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" +[default0]:AssertionError: checkpoint count 1 is different from saved mp_world_size 4 +[default2]:[2022-10-06 13:05:10,161] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default0]:[2022-10-06 13:05:10,072] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default2]:Traceback (most recent call last): +[default1]:Traceback (most recent call last): +[default2]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default2]: main() +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default2]: return f(*args, **kwargs) +[default1]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default1]: main() +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default1]: return f(*args, **kwargs) +[default1]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default1]: pretrain( +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default2]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default2]: pretrain( +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default2]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer +[default1]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer +[default1]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default2]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default2]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default1]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default2]: load_path, client_states = self._load_checkpoint(load_dir, +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default2]: sd_loader = SDLoaderFactory.get_sd_loader( +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader +[default1]: load_path, client_states = self._load_checkpoint(load_dir, +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint +[default1]: sd_loader = SDLoaderFactory.get_sd_loader( +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader +[default1]: return MegatronSDLoader(ckpt_list, version, checkpoint_engine) +[default2]: return MegatronSDLoader(ckpt_list, version, checkpoint_engine) +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ +[default2]: super().__init__(ckpt_list, version, checkpoint_engine) +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ +[default2]: self.check_ckpt_list() +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ +[default1]: super().__init__(ckpt_list, version, checkpoint_engine) +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list +[default2]: assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" +[default2]:AssertionError: checkpoint count 1 is different from saved mp_world_size 4 +[default1]: self.check_ckpt_list() +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list +[default1]: assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" +[default1]:AssertionError: checkpoint count 1 is different from saved mp_world_size 4 +[default3]:[2022-10-06 13:05:10,160] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default7]:[2022-10-06 13:05:10,147] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default3]:Traceback (most recent call last): +[default3]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default3]: main() +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default3]: return f(*args, **kwargs) +[default3]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default3]: pretrain( +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default3]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer +[default3]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default3]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default3]: load_path, client_states = self._load_checkpoint(load_dir, +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint +[default3]: sd_loader = SDLoaderFactory.get_sd_loader( +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader +[default3]: return MegatronSDLoader(ckpt_list, version, checkpoint_engine) +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ +[default3]: super().__init__(ckpt_list, version, checkpoint_engine) +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ +[default3]: self.check_ckpt_list() +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list +[default3]: assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" +[default3]:AssertionError: checkpoint count 1 is different from saved mp_world_size 4 +[default7]:Traceback (most recent call last): +[default7]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default7]: main() +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default7]: return f(*args, **kwargs) +[default7]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default7]: pretrain( +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default7]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer +[default7]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default7]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default7]: load_path, client_states = self._load_checkpoint(load_dir, +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint +[default7]: sd_loader = SDLoaderFactory.get_sd_loader( +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader +[default7]: return MegatronSDLoader(ckpt_list, version, checkpoint_engine) +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ +[default7]: super().__init__(ckpt_list, version, checkpoint_engine) +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ +[default7]: self.check_ckpt_list() +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list +[default7]: assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" +[default7]:AssertionError: checkpoint count 1 is different from saved mp_world_size 4 +[default2]:[2022-10-06 13:05:10,148] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default1]:[2022-10-06 13:05:10,148] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default2]:Traceback (most recent call last): +[default2]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default2]: main() +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default2]: return f(*args, **kwargs) +[default2]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default2]: pretrain( +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default2]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer +[default2]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default2]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default2]: load_path, client_states = self._load_checkpoint(load_dir, +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint +[default2]: sd_loader = SDLoaderFactory.get_sd_loader( +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader +[default2]: return MegatronSDLoader(ckpt_list, version, checkpoint_engine) +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ +[default2]: super().__init__(ckpt_list, version, checkpoint_engine) +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ +[default2]: self.check_ckpt_list() +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list +[default2]: assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" +[default2]:AssertionError: checkpoint count 1 is different from saved mp_world_size 4 +[default7]:Traceback (most recent call last): +[default7]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default7]: main() +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default7]: return f(*args, **kwargs) +[default7]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default7]: pretrain( +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default7]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer +[default7]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default7]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default7]: load_path, client_states = self._load_checkpoint(load_dir, +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint +[default7]: sd_loader = SDLoaderFactory.get_sd_loader( +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader +[default7]: return MegatronSDLoader(ckpt_list, version, checkpoint_engine) +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ +[default7]: super().__init__(ckpt_list, version, checkpoint_engine) +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ +[default7]: self.check_ckpt_list() +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list +[default7]: assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" +[default7]:AssertionError: checkpoint count 1 is different from saved mp_world_size 4 +[default1]:Traceback (most recent call last): +[default1]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default1]: main() +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default1]: return f(*args, **kwargs) +[default1]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default1]: pretrain( +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default1]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer +[default1]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default1]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default1]: load_path, client_states = self._load_checkpoint(load_dir, +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint +[default1]: sd_loader = SDLoaderFactory.get_sd_loader( +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader +[default1]: return MegatronSDLoader(ckpt_list, version, checkpoint_engine) +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ +[default1]: super().__init__(ckpt_list, version, checkpoint_engine) +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ +[default1]: self.check_ckpt_list() +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list +[default1]: assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" +[default1]:AssertionError: checkpoint count 1 is different from saved mp_world_size 4 +[default3]:[2022-10-06 13:05:10,194] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default4]:Traceback (most recent call last): +[default4]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default4]: main() +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default4]: return f(*args, **kwargs) +[default4]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default4]: pretrain( +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default4]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer +[default4]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default4]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default4]: load_path, client_states = self._load_checkpoint(load_dir, +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint +[default4]: sd_loader = SDLoaderFactory.get_sd_loader( +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader +[default4]: return MegatronSDLoader(ckpt_list, version, checkpoint_engine) +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ +[default4]: super().__init__(ckpt_list, version, checkpoint_engine) +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ +[default4]: self.check_ckpt_list() +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list +[default4]: assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" +[default4]:AssertionError: checkpoint count 1 is different from saved mp_world_size 4 +[default7]:[2022-10-06 13:05:10,143] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default6]:[2022-10-06 13:05:10,176] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default4]:[2022-10-06 13:05:10,143] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default7]:Traceback (most recent call last): +[default7]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default7]: main() +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default7]: return f(*args, **kwargs) +[default7]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default7]: pretrain( +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default7]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer +[default7]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default7]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default7]: load_path, client_states = self._load_checkpoint(load_dir, +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint +[default7]: sd_loader = SDLoaderFactory.get_sd_loader( +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader +[default7]: return MegatronSDLoader(ckpt_list, version, checkpoint_engine) +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ +[default7]: super().__init__(ckpt_list, version, checkpoint_engine) +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ +[default7]: self.check_ckpt_list() +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list +[default7]: assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" +[default7]:AssertionError: checkpoint count 1 is different from saved mp_world_size 4 +[default5]:Traceback (most recent call last): +[default5]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default5]: main() +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default5]: return f(*args, **kwargs) +[default5]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default5]: pretrain( +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default5]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer +[default5]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default5]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default5]: load_path, client_states = self._load_checkpoint(load_dir, +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint +[default5]: sd_loader = SDLoaderFactory.get_sd_loader( +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader +[default5]: return MegatronSDLoader(ckpt_list, version, checkpoint_engine) +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ +[default5]: super().__init__(ckpt_list, version, checkpoint_engine) +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ +[default5]: self.check_ckpt_list() +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list +[default5]: assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" +[default5]:AssertionError: checkpoint count 1 is different from saved mp_world_size 4 +[default6]:Traceback (most recent call last): +[default6]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default6]: main() +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default6]: return f(*args, **kwargs) +[default6]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default6]: pretrain( +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default6]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer +[default6]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default6]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default6]: load_path, client_states = self._load_checkpoint(load_dir, +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint +[default6]: sd_loader = SDLoaderFactory.get_sd_loader( +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader +[default6]: return MegatronSDLoader(ckpt_list, version, checkpoint_engine) +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ +[default6]: super().__init__(ckpt_list, version, checkpoint_engine) +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ +[default6]: self.check_ckpt_list() +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list +[default6]: assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" +[default6]:AssertionError: checkpoint count 1 is different from saved mp_world_size 4 +[default7]:[2022-10-06 13:05:10,175] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default0]:[2022-10-06 13:05:10,182] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default0]:Traceback (most recent call last): +[default0]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default0]: main() +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default0]: return f(*args, **kwargs) +[default0]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default0]: pretrain( +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default0]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer +[default0]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default0]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default0]: load_path, client_states = self._load_checkpoint(load_dir, +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint +[default0]: sd_loader = SDLoaderFactory.get_sd_loader( +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader +[default0]: return MegatronSDLoader(ckpt_list, version, checkpoint_engine) +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ +[default0]: super().__init__(ckpt_list, version, checkpoint_engine) +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ +[default0]: self.check_ckpt_list() +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list +[default0]: assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" +[default0]:AssertionError: checkpoint count 1 is different from saved mp_world_size 4 +[default7]:[2022-10-06 13:05:10,220] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default3]:Traceback (most recent call last): +[default3]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default3]: main() +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default3]: return f(*args, **kwargs) +[default3]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default3]: pretrain( +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default3]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer +[default3]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default3]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default3]: load_path, client_states = self._load_checkpoint(load_dir, +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint +[default3]: sd_loader = SDLoaderFactory.get_sd_loader( +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader +[default3]: return MegatronSDLoader(ckpt_list, version, checkpoint_engine) +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ +[default3]: super().__init__(ckpt_list, version, checkpoint_engine) +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ +[default3]: self.check_ckpt_list() +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list +[default3]: assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" +[default3]:AssertionError: checkpoint count 1 is different from saved mp_world_size 4 +[default0]:Traceback (most recent call last): +[default0]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default0]: main() +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default0]: return f(*args, **kwargs) +[default0]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default0]: pretrain( +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default0]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer +[default0]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default0]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default0]: load_path, client_states = self._load_checkpoint(load_dir, +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint +[default0]: sd_loader = SDLoaderFactory.get_sd_loader( +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader +[default0]: return MegatronSDLoader(ckpt_list, version, checkpoint_engine) +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ +[default0]: super().__init__(ckpt_list, version, checkpoint_engine) +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ +[default0]: self.check_ckpt_list() +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list +[default0]: assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" +[default0]:AssertionError: checkpoint count 1 is different from saved mp_world_size 4 +[default0]:[2022-10-06 13:05:10,220] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default5]:[2022-10-06 13:05:10,205] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default7]:Traceback (most recent call last): +[default7]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default7]: main() +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default7]: return f(*args, **kwargs) +[default7]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default7]: pretrain( +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default7]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer +[default7]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default7]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default7]: load_path, client_states = self._load_checkpoint(load_dir, +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint +[default7]: sd_loader = SDLoaderFactory.get_sd_loader( +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader +[default7]: return MegatronSDLoader(ckpt_list, version, checkpoint_engine) +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ +[default7]: super().__init__(ckpt_list, version, checkpoint_engine) +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ +[default7]: self.check_ckpt_list() +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list +[default7]: assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" +[default7]:AssertionError: checkpoint count 1 is different from saved mp_world_size 4 +[default1]:Traceback (most recent call last): +[default1]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default1]: main() +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default1]: return f(*args, **kwargs) +[default1]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default1]: pretrain( +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default1]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer +[default1]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default1]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default1]: load_path, client_states = self._load_checkpoint(load_dir, +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint +[default1]:[2022-10-06 13:05:10,382] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default1]: sd_loader = SDLoaderFactory.get_sd_loader( +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader +[default3]:Traceback (most recent call last): +[default3]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default3]: main() +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default3]: return f(*args, **kwargs) +[default3]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default3]: pretrain( +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default3]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer +[default3]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default3]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default3]: load_path, client_states = self._load_checkpoint(load_dir, +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint +[default3]: sd_loader = SDLoaderFactory.get_sd_loader( +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader +[default3]: return MegatronSDLoader(ckpt_list, version, checkpoint_engine) +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ +[default3]: super().__init__(ckpt_list, version, checkpoint_engine) +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ +[default3]: self.check_ckpt_list() +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list +[default3]: assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" +[default3]:AssertionError: checkpoint count 1 is different from saved mp_world_size 4 +[default3]:[2022-10-06 13:05:10,297] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default0]:Traceback (most recent call last): +[default0]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default0]: main() +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default0]: return f(*args, **kwargs) +[default0]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default0]: pretrain( +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default0]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer +[default0]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default0]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default0]: load_path, client_states = self._load_checkpoint(load_dir, +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint +[default0]: sd_loader = SDLoaderFactory.get_sd_loader( +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader +[default0]: return MegatronSDLoader(ckpt_list, version, checkpoint_engine) +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ +[default0]:[2022-10-06 13:05:10,394] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default0]: super().__init__(ckpt_list, version, checkpoint_engine) +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ +[default0]: self.check_ckpt_list() +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list +[default0]: assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" +[default0]:AssertionError: checkpoint count 1 is different from saved mp_world_size 4 +[default7]:[2022-10-06 13:05:10,433] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default2]:[2022-10-06 13:05:10,442] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default7]:[2022-10-06 13:05:10,475] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default1]: return MegatronSDLoader(ckpt_list, version, checkpoint_engine) +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ +[default1]: super().__init__(ckpt_list, version, checkpoint_engine) +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ +[default1]: self.check_ckpt_list() +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list +[default1]: assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" +[default1]:AssertionError: checkpoint count 1 is different from saved mp_world_size 4 +[default2]:Traceback (most recent call last): +[default2]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default2]: main() +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default2]: return f(*args, **kwargs) +[default2]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default2]: pretrain( +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default2]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer +[default2]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default2]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default2]: load_path, client_states = self._load_checkpoint(load_dir, +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint +[default2]: sd_loader = SDLoaderFactory.get_sd_loader( +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader +[default2]: return MegatronSDLoader(ckpt_list, version, checkpoint_engine) +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ +[default2]: super().__init__(ckpt_list, version, checkpoint_engine) +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ +[default2]: self.check_ckpt_list() +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list +[default2]: assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" +[default2]:AssertionError: checkpoint count 1 is different from saved mp_world_size 4 +[default0]:Traceback (most recent call last): +[default0]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default0]: main() +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default0]: return f(*args, **kwargs) +[default0]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default0]: pretrain( +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default0]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer +[default0]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default0]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default0]: load_path, client_states = self._load_checkpoint(load_dir, +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint +[default0]: sd_loader = SDLoaderFactory.get_sd_loader( +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader +[default0]: return MegatronSDLoader(ckpt_list, version, checkpoint_engine) +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ +[default0]: super().__init__(ckpt_list, version, checkpoint_engine) +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ +[default0]: self.check_ckpt_list() +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list +[default0]: assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" +[default0]:AssertionError: checkpoint count 1 is different from saved mp_world_size 4 +[default5]:Traceback (most recent call last): +[default5]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default5]: main() +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default5]: return f(*args, **kwargs) +[default5]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default5]: pretrain( +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default5]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer +[default5]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default5]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default5]: load_path, client_states = self._load_checkpoint(load_dir, +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint +[default5]: sd_loader = SDLoaderFactory.get_sd_loader( +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader +[default5]: return MegatronSDLoader(ckpt_list, version, checkpoint_engine) +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ +[default5]: super().__init__(ckpt_list, version, checkpoint_engine) +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ +[default5]: self.check_ckpt_list() +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list +[default5]: assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" +[default5]:AssertionError: checkpoint count 1 is different from saved mp_world_size 4 +[default7]:Traceback (most recent call last): +[default7]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default7]: main() +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default7]: return f(*args, **kwargs) +[default7]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default7]: pretrain( +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default7]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer +[default7]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default7]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default7]: load_path, client_states = self._load_checkpoint(load_dir, +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint +[default7]: sd_loader = SDLoaderFactory.get_sd_loader( +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader +[default7]: return MegatronSDLoader(ckpt_list, version, checkpoint_engine) +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ +[default7]: super().__init__(ckpt_list, version, checkpoint_engine) +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ +[default7]: self.check_ckpt_list() +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list +[default7]: assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" +[default7]:AssertionError: checkpoint count 1 is different from saved mp_world_size 4 +[default0]:[2022-10-06 13:05:10,412] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default5]:[2022-10-06 13:05:10,454] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default1]:[2022-10-06 13:05:10,438] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default1]:Traceback (most recent call last): +[default1]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default1]: main() +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default1]: return f(*args, **kwargs) +[default1]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default1]: pretrain( +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default1]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer +[default1]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default1]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default1]: load_path, client_states = self._load_checkpoint(load_dir, +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint +[default1]: sd_loader = SDLoaderFactory.get_sd_loader( +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader +[default1]: return MegatronSDLoader(ckpt_list, version, checkpoint_engine) +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ +[default1]: super().__init__(ckpt_list, version, checkpoint_engine) +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ +[default1]: self.check_ckpt_list() +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list +[default1]: assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" +[default1]:AssertionError: checkpoint count 1 is different from saved mp_world_size 4 +[default7]:Traceback (most recent call last): +[default7]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default7]: main() +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default7]: return f(*args, **kwargs) +[default7]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default7]: pretrain( +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default7]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer +[default7]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default7]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default7]: load_path, client_states = self._load_checkpoint(load_dir, +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint +[default7]: sd_loader = SDLoaderFactory.get_sd_loader( +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader +[default7]: return MegatronSDLoader(ckpt_list, version, checkpoint_engine) +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ +[default7]: super().__init__(ckpt_list, version, checkpoint_engine) +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ +[default7]: self.check_ckpt_list() +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list +[default7]: assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" +[default7]:AssertionError: checkpoint count 1 is different from saved mp_world_size 4 +[default4]:[2022-10-06 13:05:10,525] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default5]:[2022-10-06 13:05:10,505] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default6]:[2022-10-06 13:05:10,563] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default4]:Traceback (most recent call last): +[default4]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default4]: main() +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default4]: return f(*args, **kwargs) +[default4]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default4]: pretrain( +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default4]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer +[default4]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default4]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default4]: load_path, client_states = self._load_checkpoint(load_dir, +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint +[default4]: sd_loader = SDLoaderFactory.get_sd_loader( +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader +[default4]: return MegatronSDLoader(ckpt_list, version, checkpoint_engine) +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ +[default4]: super().__init__(ckpt_list, version, checkpoint_engine) +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ +[default4]: self.check_ckpt_list() +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list +[default4]: assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" +[default4]:AssertionError: checkpoint count 1 is different from saved mp_world_size 4 +[default5]:[2022-10-06 13:05:10,502] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default6]:Traceback (most recent call last): +[default6]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default6]: main() +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default6]: return f(*args, **kwargs) +[default6]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default6]: pretrain( +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default6]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer +[default6]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default6]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default6]: load_path, client_states = self._load_checkpoint(load_dir, +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint +[default6]: sd_loader = SDLoaderFactory.get_sd_loader( +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader +[default6]: return MegatronSDLoader(ckpt_list, version, checkpoint_engine) +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ +[default6]: super().__init__(ckpt_list, version, checkpoint_engine) +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ +[default6]: self.check_ckpt_list() +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list +[default6]: assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" +[default6]:AssertionError: checkpoint count 1 is different from saved mp_world_size 4 +[default5]:Traceback (most recent call last): +[default5]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default5]: main() +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default5]: return f(*args, **kwargs) +[default5]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default5]: pretrain( +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default5]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer +[default5]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default5]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default5]: load_path, client_states = self._load_checkpoint(load_dir, +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint +[default5]: sd_loader = SDLoaderFactory.get_sd_loader( +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader +[default5]: return MegatronSDLoader(ckpt_list, version, checkpoint_engine) +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ +[default5]: super().__init__(ckpt_list, version, checkpoint_engine) +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ +[default5]: self.check_ckpt_list() +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list +[default5]: assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" +[default5]:AssertionError: checkpoint count 1 is different from saved mp_world_size 4 +[default5]:Traceback (most recent call last): +[default5]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default5]: main() +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default5]: return f(*args, **kwargs) +[default5]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default5]: pretrain( +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default5]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer +[default5]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default5]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default5]: load_path, client_states = self._load_checkpoint(load_dir, +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint +[default5]: sd_loader = SDLoaderFactory.get_sd_loader( +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader +[default5]: return MegatronSDLoader(ckpt_list, version, checkpoint_engine) +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ +[default5]: super().__init__(ckpt_list, version, checkpoint_engine) +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ +[default5]: self.check_ckpt_list() +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list +[default5]: assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" +[default5]:AssertionError: checkpoint count 1 is different from saved mp_world_size 4 +[default5]:[2022-10-06 13:05:10,552] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default3]:[2022-10-06 13:05:10,538] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default2]:Traceback (most recent call last): +[default2]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default2]: main() +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default2]: return f(*args, **kwargs) +[default2]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default2]: pretrain( +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default2]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer +[default2]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default2]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default2]: load_path, client_states = self._load_checkpoint(load_dir, +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint +[default2]: sd_loader = SDLoaderFactory.get_sd_loader( +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader +[default2]: return MegatronSDLoader(ckpt_list, version, checkpoint_engine) +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ +[default2]: super().__init__(ckpt_list, version, checkpoint_engine) +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ +[default2]: self.check_ckpt_list() +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list +[default2]: assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" +[default2]:AssertionError: checkpoint count 1 is different from saved mp_world_size 4 +[default3]:Traceback (most recent call last): +[default3]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default3]: main() +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default3]: return f(*args, **kwargs) +[default3]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default3]: pretrain( +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default3]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer +[default3]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default3]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default3]: load_path, client_states = self._load_checkpoint(load_dir, +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint +[default3]: sd_loader = SDLoaderFactory.get_sd_loader( +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader +[default3]: return MegatronSDLoader(ckpt_list, version, checkpoint_engine) +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ +[default3]: super().__init__(ckpt_list, version, checkpoint_engine) +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ +[default3]: self.check_ckpt_list() +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list +[default3]: assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" +[default3]:AssertionError: checkpoint count 1 is different from saved mp_world_size 4 +[default2]:[2022-10-06 13:05:10,583] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default5]:Traceback (most recent call last): +[default5]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default5]: main() +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default5]: return f(*args, **kwargs) +[default5]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default5]: pretrain( +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default5]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer +[default5]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default5]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default5]: load_path, client_states = self._load_checkpoint(load_dir, +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint +[default5]: sd_loader = SDLoaderFactory.get_sd_loader( +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader +[default5]: return MegatronSDLoader(ckpt_list, version, checkpoint_engine) +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ +[default5]: super().__init__(ckpt_list, version, checkpoint_engine) +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ +[default5]: self.check_ckpt_list() +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list +[default5]: assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" +[default5]:AssertionError: checkpoint count 1 is different from saved mp_world_size 4 +[default1]:[2022-10-06 13:05:10,625] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default2]:[2022-10-06 13:05:10,605] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default1]:Traceback (most recent call last): +[default1]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default1]: main() +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default1]: return f(*args, **kwargs) +[default1]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default1]: pretrain( +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default1]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer +[default1]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default1]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default1]: load_path, client_states = self._load_checkpoint(load_dir, +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint +[default1]: sd_loader = SDLoaderFactory.get_sd_loader( +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader +[default1]: return MegatronSDLoader(ckpt_list, version, checkpoint_engine) +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ +[default1]: super().__init__(ckpt_list, version, checkpoint_engine) +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ +[default1]: self.check_ckpt_list() +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list +[default1]: assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" +[default1]:AssertionError: checkpoint count 1 is different from saved mp_world_size 4 +[default2]:Traceback (most recent call last): +[default2]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default2]: main() +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default2]: return f(*args, **kwargs) +[default2]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default2]: pretrain( +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default2]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer +[default2]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default2]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default2]: load_path, client_states = self._load_checkpoint(load_dir, +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint +[default2]: sd_loader = SDLoaderFactory.get_sd_loader( +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader +[default2]: return MegatronSDLoader(ckpt_list, version, checkpoint_engine) +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ +[default2]: super().__init__(ckpt_list, version, checkpoint_engine) +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ +[default2]: self.check_ckpt_list() +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list +[default2]: assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" +[default2]:AssertionError: checkpoint count 1 is different from saved mp_world_size 4 +[default0]:[2022-10-06 13:05:10,660] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default0]:Traceback (most recent call last): +[default0]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default0]: main() +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default6]:[2022-10-06 13:05:10,655] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default0]: return f(*args, **kwargs) +[default0]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default0]: pretrain( +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default0]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer +[default0]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default4]:[2022-10-06 13:05:10,604] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default0]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default0]: load_path, client_states = self._load_checkpoint(load_dir, +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint +[default0]: sd_loader = SDLoaderFactory.get_sd_loader( +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader +[default0]: return MegatronSDLoader(ckpt_list, version, checkpoint_engine) +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ +[default0]: super().__init__(ckpt_list, version, checkpoint_engine) +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ +[default0]: self.check_ckpt_list() +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list +[default0]: assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" +[default0]:AssertionError: checkpoint count 1 is different from saved mp_world_size 4 +[default4]:Traceback (most recent call last): +[default4]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default4]: main() +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default4]: return f(*args, **kwargs) +[default4]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default4]: pretrain( +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default4]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer +[default4]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default4]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default4]: load_path, client_states = self._load_checkpoint(load_dir, +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint +[default4]: sd_loader = SDLoaderFactory.get_sd_loader( +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader +[default4]: return MegatronSDLoader(ckpt_list, version, checkpoint_engine) +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ +[default4]: super().__init__(ckpt_list, version, checkpoint_engine) +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ +[default4]: self.check_ckpt_list() +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list +[default4]: assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" +[default4]:AssertionError: checkpoint count 1 is different from saved mp_world_size 4 +[default4]:Traceback (most recent call last): +[default4]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default4]: main() +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default4]: return f(*args, **kwargs) +[default4]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default4]: pretrain( +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default4]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer +[default4]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default4]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default4]: load_path, client_states = self._load_checkpoint(load_dir, +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint +[default4]: sd_loader = SDLoaderFactory.get_sd_loader( +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader +[default4]: return MegatronSDLoader(ckpt_list, version, checkpoint_engine) +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ +[default4]: super().__init__(ckpt_list, version, checkpoint_engine) +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ +[default4]: self.check_ckpt_list() +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list +[default4]: assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" +[default4]:AssertionError: checkpoint count 1 is different from saved mp_world_size 4 +[default6]:[2022-10-06 13:05:10,633] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default6]:Traceback (most recent call last): +[default6]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default6]: main() +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default6]: return f(*args, **kwargs) +[default6]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default6]: pretrain( +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default6]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer +[default6]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default6]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default6]: load_path, client_states = self._load_checkpoint(load_dir, +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint +[default6]: sd_loader = SDLoaderFactory.get_sd_loader( +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader +[default6]: return MegatronSDLoader(ckpt_list, version, checkpoint_engine) +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ +[default6]: super().__init__(ckpt_list, version, checkpoint_engine) +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ +[default6]: self.check_ckpt_list() +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list +[default6]: assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" +[default6]:AssertionError: checkpoint count 1 is different from saved mp_world_size 4 +[default4]:[2022-10-06 13:05:10,610] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default6]:Traceback (most recent call last): +[default6]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default6]: main() +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default6]: return f(*args, **kwargs) +[default6]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default6]: pretrain( +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default6]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer +[default6]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default6]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default6]: load_path, client_states = self._load_checkpoint(load_dir, +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint +[default6]: sd_loader = SDLoaderFactory.get_sd_loader( +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader +[default6]: return MegatronSDLoader(ckpt_list, version, checkpoint_engine) +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ +[default6]: super().__init__(ckpt_list, version, checkpoint_engine) +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ +[default6]: self.check_ckpt_list() +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list +[default6]: assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" +[default6]:AssertionError: checkpoint count 1 is different from saved mp_world_size 4 +[default3]:[2022-10-06 13:05:10,773] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default2]:[2022-10-06 13:05:10,745] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default3]:Traceback (most recent call last): +[default3]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default3]: main() +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default3]: return f(*args, **kwargs) +[default3]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default3]: pretrain( +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default3]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer +[default3]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default3]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default3]: load_path, client_states = self._load_checkpoint(load_dir, +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint +[default3]: sd_loader = SDLoaderFactory.get_sd_loader( +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader +[default3]: return MegatronSDLoader(ckpt_list, version, checkpoint_engine) +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ +[default3]: super().__init__(ckpt_list, version, checkpoint_engine) +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ +[default3]: self.check_ckpt_list() +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list +[default3]: assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" +[default3]:AssertionError: checkpoint count 1 is different from saved mp_world_size 4 +[default1]:[2022-10-06 13:05:10,727] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default2]:Traceback (most recent call last): +[default2]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default2]: main() +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default2]: return f(*args, **kwargs) +[default2]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default2]: pretrain( +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default2]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer +[default2]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default2]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default2]: load_path, client_states = self._load_checkpoint(load_dir, +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint +[default2]: sd_loader = SDLoaderFactory.get_sd_loader( +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader +[default2]: return MegatronSDLoader(ckpt_list, version, checkpoint_engine) +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ +[default2]: super().__init__(ckpt_list, version, checkpoint_engine) +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ +[default2]: self.check_ckpt_list() +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list +[default2]: assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" +[default2]:AssertionError: checkpoint count 1 is different from saved mp_world_size 4 +[default1]:Traceback (most recent call last): +[default1]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default1]: main() +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default1]: return f(*args, **kwargs) +[default1]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default1]: pretrain( +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default1]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer +[default1]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default1]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default1]: load_path, client_states = self._load_checkpoint(load_dir, +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint +[default1]: sd_loader = SDLoaderFactory.get_sd_loader( +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader +[default1]: return MegatronSDLoader(ckpt_list, version, checkpoint_engine) +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ +[default1]: super().__init__(ckpt_list, version, checkpoint_engine) +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ +[default1]: self.check_ckpt_list() +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list +[default1]: assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" +[default1]:AssertionError: checkpoint count 1 is different from saved mp_world_size 4 +[default4]:[2022-10-06 13:05:10,884] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default4]:Traceback (most recent call last): +[default4]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default4]: main() +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default4]: return f(*args, **kwargs) +[default4]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default4]: pretrain( +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default4]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer +[default4]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default4]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default4]: load_path, client_states = self._load_checkpoint(load_dir, +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint +[default4]: sd_loader = SDLoaderFactory.get_sd_loader( +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader +[default4]: return MegatronSDLoader(ckpt_list, version, checkpoint_engine) +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ +[default4]: super().__init__(ckpt_list, version, checkpoint_engine) +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ +[default4]: self.check_ckpt_list() +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list +[default4]: assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" +[default4]:AssertionError: checkpoint count 1 is different from saved mp_world_size 4 +[default5]:Traceback (most recent call last): +[default5]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default5]: main() +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default5]: return f(*args, **kwargs) +[default5]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default5]: pretrain( +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default5]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer +[default5]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default5]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default5]: load_path, client_states = self._load_checkpoint(load_dir, +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint +[default5]: sd_loader = SDLoaderFactory.get_sd_loader( +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader +[default5]: return MegatronSDLoader(ckpt_list, version, checkpoint_engine) +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ +[default5]: super().__init__(ckpt_list, version, checkpoint_engine) +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ +[default5]: self.check_ckpt_list() +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list +[default5]: assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" +[default5]:AssertionError: checkpoint count 1 is different from saved mp_world_size 4 +[default5]:[2022-10-06 13:05:10,905] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default3]:Traceback (most recent call last): +[default3]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default3]: main() +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default3]: return f(*args, **kwargs) +[default3]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default3]: pretrain( +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default3]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer +[default3]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default3]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default3]: load_path, client_states = self._load_checkpoint(load_dir, +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint +[default3]: sd_loader = SDLoaderFactory.get_sd_loader( +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader +[default3]: return MegatronSDLoader(ckpt_list, version, checkpoint_engine) +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ +[default3]: super().__init__(ckpt_list, version, checkpoint_engine) +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ +[default3]: self.check_ckpt_list() +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list +[default3]: assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" +[default3]:AssertionError: checkpoint count 1 is different from saved mp_world_size 4 +[default6]:[2022-10-06 13:05:10,939] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default2]:[2022-10-06 13:05:10,976] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default1]:Traceback (most recent call last): +[default1]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default1]: main() +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default1]: return f(*args, **kwargs) +[default1]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default1]: pretrain( +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default1]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer +[default1]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default1]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default1]: load_path, client_states = self._load_checkpoint(load_dir, +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint +[default1]: sd_loader = SDLoaderFactory.get_sd_loader( +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader +[default1]:[2022-10-06 13:05:10,988] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default1]: return MegatronSDLoader(ckpt_list, version, checkpoint_engine) +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ +[default1]: super().__init__(ckpt_list, version, checkpoint_engine) +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ +[default1]: self.check_ckpt_list() +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list +[default2]:Traceback (most recent call last): +[default2]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default2]: main() +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default2]: return f(*args, **kwargs) +[default2]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default1]: assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" +[default1]:AssertionError: checkpoint count 1 is different from saved mp_world_size 4 +[default2]: pretrain( +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default2]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer +[default2]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default3]:[2022-10-06 13:05:10,976] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default2]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default2]: load_path, client_states = self._load_checkpoint(load_dir, +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint +[default2]: sd_loader = SDLoaderFactory.get_sd_loader( +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader +[default2]: return MegatronSDLoader(ckpt_list, version, checkpoint_engine) +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ +[default2]: super().__init__(ckpt_list, version, checkpoint_engine) +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ +[default2]: self.check_ckpt_list() +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list +[default2]: assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" +[default2]:AssertionError: checkpoint count 1 is different from saved mp_world_size 4 +[default6]:Traceback (most recent call last): +[default6]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default6]: main() +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default6]: return f(*args, **kwargs) +[default6]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default6]: pretrain( +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default6]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer +[default6]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default6]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default6]: load_path, client_states = self._load_checkpoint(load_dir, +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint +[default6]: sd_loader = SDLoaderFactory.get_sd_loader( +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader +[default6]: return MegatronSDLoader(ckpt_list, version, checkpoint_engine) +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ +[default6]: super().__init__(ckpt_list, version, checkpoint_engine) +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ +[default6]: self.check_ckpt_list() +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list +[default6]: assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" +[default6]:AssertionError: checkpoint count 1 is different from saved mp_world_size 4 +[default4]:[2022-10-06 13:05:11,077] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default3]:[2022-10-06 13:05:11,080] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default5]:[2022-10-06 13:05:11,063] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default6]:[2022-10-06 13:05:11,063] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default4]:Traceback (most recent call last): +[default4]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default4]: main() +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default4]: return f(*args, **kwargs) +[default4]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default4]: pretrain( +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default4]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer +[default4]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default4]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default4]: load_path, client_states = self._load_checkpoint(load_dir, +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint +[default4]: sd_loader = SDLoaderFactory.get_sd_loader( +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader +[default4]: return MegatronSDLoader(ckpt_list, version, checkpoint_engine) +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ +[default4]: super().__init__(ckpt_list, version, checkpoint_engine) +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ +[default4]: self.check_ckpt_list() +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list +[default4]: assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" +[default4]:AssertionError: checkpoint count 1 is different from saved mp_world_size 4 +[default6]:Traceback (most recent call last): +[default6]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default6]: main() +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default6]: return f(*args, **kwargs) +[default6]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default6]: pretrain( +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default6]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer +[default6]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default5]:Traceback (most recent call last): +[default5]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default5]: main() +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default6]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default5]: return f(*args, **kwargs) +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default6]: load_path, client_states = self._load_checkpoint(load_dir, +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint +[default6]: sd_loader = SDLoaderFactory.get_sd_loader( +[default5]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default5]: pretrain( +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader +[default6]: return MegatronSDLoader(ckpt_list, version, checkpoint_engine) +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default5]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer +[default5]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default6]: super().__init__(ckpt_list, version, checkpoint_engine) +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ +[default6]: self.check_ckpt_list() +[default5]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list +[default6]: assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" +[default6]:AssertionError: checkpoint count 1 is different from saved mp_world_size 4 +[default5]: load_path, client_states = self._load_checkpoint(load_dir, +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint +[default5]: sd_loader = SDLoaderFactory.get_sd_loader( +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader +[default5]: return MegatronSDLoader(ckpt_list, version, checkpoint_engine) +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ +[default5]: super().__init__(ckpt_list, version, checkpoint_engine) +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ +[default5]: self.check_ckpt_list() +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list +[default5]: assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" +[default5]:AssertionError: checkpoint count 1 is different from saved mp_world_size 4 +[default0]:Traceback (most recent call last): +[default0]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default0]: main() +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default0]: return f(*args, **kwargs) +[default0]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default0]: pretrain( +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default0]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer +[default0]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default0]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default0]: load_path, client_states = self._load_checkpoint(load_dir, +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint +[default0]: sd_loader = SDLoaderFactory.get_sd_loader( +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader +[default0]: return MegatronSDLoader(ckpt_list, version, checkpoint_engine) +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ +[default0]: super().__init__(ckpt_list, version, checkpoint_engine) +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ +[default0]: self.check_ckpt_list() +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list +[default0]: assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" +[default0]:AssertionError: checkpoint count 1 is different from saved mp_world_size 4 +[default0]:[2022-10-06 13:05:11,015] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default3]:Traceback (most recent call last): +[default3]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default0]:[2022-10-06 13:05:11,093] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default3]: main() +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default3]: return f(*args, **kwargs) +[default3]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default3]: pretrain( +[default0]:Traceback (most recent call last): +[default0]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default0]: main() +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default0]: return f(*args, **kwargs) +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default3]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer +[default0]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default0]: pretrain( +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default0]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default3]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default3]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer +[default0]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default3]: load_path, client_states = self._load_checkpoint(load_dir, +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint +[default3]: sd_loader = SDLoaderFactory.get_sd_loader( +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default0]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default3]: return MegatronSDLoader(ckpt_list, version, checkpoint_engine) +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ +[default3]: super().__init__(ckpt_list, version, checkpoint_engine) +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default0]: load_path, client_states = self._load_checkpoint(load_dir, +[default3]: self.check_ckpt_list() +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint +[default0]: sd_loader = SDLoaderFactory.get_sd_loader( +[default3]: assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" +[default3]:AssertionError: checkpoint count 1 is different from saved mp_world_size 4 +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader +[default0]: return MegatronSDLoader(ckpt_list, version, checkpoint_engine) +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ +[default0]: super().__init__(ckpt_list, version, checkpoint_engine) +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ +[default0]: self.check_ckpt_list() +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list +[default0]: assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" +[default0]:AssertionError: checkpoint count 1 is different from saved mp_world_size 4 +[default6]:[2022-10-06 13:05:11,116] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default6]:Traceback (most recent call last): +[default6]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default6]: main() +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default6]: return f(*args, **kwargs) +[default6]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default6]: pretrain( +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default6]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer +[default6]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default6]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default6]: load_path, client_states = self._load_checkpoint(load_dir, +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint +[default6]: sd_loader = SDLoaderFactory.get_sd_loader( +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader +[default6]: return MegatronSDLoader(ckpt_list, version, checkpoint_engine) +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ +[default6]: super().__init__(ckpt_list, version, checkpoint_engine) +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ +[default6]: self.check_ckpt_list() +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list +[default6]: assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" +[default6]:AssertionError: checkpoint count 1 is different from saved mp_world_size 4 +[default2]:[2022-10-06 13:05:11,189] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default2]:Traceback (most recent call last): +[default2]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default2]: main() +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default2]: return f(*args, **kwargs) +[default2]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default2]: pretrain( +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default2]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer +[default2]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default2]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default2]: load_path, client_states = self._load_checkpoint(load_dir, +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint +[default2]: sd_loader = SDLoaderFactory.get_sd_loader( +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader +[default2]: return MegatronSDLoader(ckpt_list, version, checkpoint_engine) +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ +[default2]: super().__init__(ckpt_list, version, checkpoint_engine) +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ +[default2]: self.check_ckpt_list() +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list +[default2]: assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" +[default2]:AssertionError: checkpoint count 1 is different from saved mp_world_size 4 +[default4]:Traceback (most recent call last): +[default4]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default4]: main() +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default4]: return f(*args, **kwargs) +[default4]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default4]: pretrain( +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default4]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer +[default4]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default4]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default4]: load_path, client_states = self._load_checkpoint(load_dir, +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint +[default4]: sd_loader = SDLoaderFactory.get_sd_loader( +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader +[default4]: return MegatronSDLoader(ckpt_list, version, checkpoint_engine) +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ +[default4]: super().__init__(ckpt_list, version, checkpoint_engine) +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ +[default4]: self.check_ckpt_list() +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list +[default4]: assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" +[default4]:AssertionError: checkpoint count 1 is different from saved mp_world_size 4 +[default5]:[2022-10-06 13:05:11,245] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default4]:[2022-10-06 13:05:11,282] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default5]:Traceback (most recent call last): +[default5]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default5]: main() +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default5]: return f(*args, **kwargs) +[default5]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default5]: pretrain( +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default5]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer +[default5]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default5]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default5]: load_path, client_states = self._load_checkpoint(load_dir, +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint +[default5]: sd_loader = SDLoaderFactory.get_sd_loader( +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader +[default5]: return MegatronSDLoader(ckpt_list, version, checkpoint_engine) +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ +[default5]: super().__init__(ckpt_list, version, checkpoint_engine) +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ +[default5]: self.check_ckpt_list() +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list +[default5]: assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" +[default5]:AssertionError: checkpoint count 1 is different from saved mp_world_size 4 +[default7]:[2022-10-06 13:05:11,362] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default1]:[2022-10-06 13:05:11,317] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default1]:Traceback (most recent call last): +[default1]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default1]: main() +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default1]: return f(*args, **kwargs) +[default1]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default1]: pretrain( +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default7]:Traceback (most recent call last): +[default1]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer +[default1]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default1]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default1]: load_path, client_states = self._load_checkpoint(load_dir, +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint +[default1]: sd_loader = SDLoaderFactory.get_sd_loader( +[default7]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader +[default1]: return MegatronSDLoader(ckpt_list, version, checkpoint_engine) +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ +[default1]: super().__init__(ckpt_list, version, checkpoint_engine) +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ +[default7]: main() +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default1]: self.check_ckpt_list() +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list +[default7]: return f(*args, **kwargs) +[default7]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default1]: assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" +[default1]:AssertionError: checkpoint count 1 is different from saved mp_world_size 4 +[default7]: pretrain( +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default7]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer +[default7]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default7]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default7]: load_path, client_states = self._load_checkpoint(load_dir, +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint +[default7]: sd_loader = SDLoaderFactory.get_sd_loader( +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader +[default7]: return MegatronSDLoader(ckpt_list, version, checkpoint_engine) +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ +[default7]: super().__init__(ckpt_list, version, checkpoint_engine) +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ +[default7]: self.check_ckpt_list() +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list +[default7]: assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" +[default7]:AssertionError: checkpoint count 1 is different from saved mp_world_size 4 +[default7]:[2022-10-06 13:05:11,306] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default7]:Traceback (most recent call last): +[default7]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default7]: main() +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default7]: return f(*args, **kwargs) +[default7]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default7]: pretrain( +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default7]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer +[default7]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default7]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default7]: load_path, client_states = self._load_checkpoint(load_dir, +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint +[default7]: sd_loader = SDLoaderFactory.get_sd_loader( +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader +[default7]: return MegatronSDLoader(ckpt_list, version, checkpoint_engine) +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ +[default7]: super().__init__(ckpt_list, version, checkpoint_engine) +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ +[default7]: self.check_ckpt_list() +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list +[default7]: assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" +[default7]:AssertionError: checkpoint count 1 is different from saved mp_world_size 4 +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 881240 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 2797203 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 3526428 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 3726476 closing signal SIGTERM +ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 0 (pid: 826587) of binary: /gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/bin/python +ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 0 (pid: 1094201) of binary: /gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/bin/python +ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 0 (pid: 2861837) of binary: /gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/bin/python +ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 0 (pid: 75207) of binary: /gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/bin/python +ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 1 (pid: 3526429) of binary: /gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/bin/python +ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 1 (pid: 881241) of binary: /gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/bin/python +ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 1 (pid: 2797204) of binary: /gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/bin/python +ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 1 (pid: 3726477) of binary: /gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/bin/python +Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 194, in _run_module_as_main + return _run_code(code, main_globals, None, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 87, in _run_code + exec(code, run_globals) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 765, in +Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 194, in _run_module_as_main +Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 194, in _run_module_as_main + main() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return _run_code(code, main_globals, None, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 87, in _run_code + return _run_code(code, main_globals, None, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 87, in _run_code + exec(code, run_globals) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 765, in + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 761, in main + exec(code, run_globals) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 765, in + run(args) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 752, in run + main() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + main() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 761, in main + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 761, in main + elastic_launch( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 131, in __call__ + return launch_agent(self._config, self._entrypoint, list(args)) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 245, in launch_agent + run(args) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 752, in run +Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 194, in _run_module_as_main + raise ChildFailedError( + run(args) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 752, in run +Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 194, in _run_module_as_main +torch.distributed.elastic.multiprocessing.errors.ChildFailedError: +============================================================ +/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py FAILED +------------------------------------------------------------ +Failures: +[1]: + time : 2022-10-06_13:05:10 + host : jean-zay-iam11-ib0 + rank : 10 (local_rank: 2) + exitcode : 1 (pid: 3526430) + error_file: /tmp/torchelastic_8zfw451d/none_3xx1kznc/attempt_0/2/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + return _run_code(code, main_globals, None, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 87, in _run_code + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint + sd_loader = SDLoaderFactory.get_sd_loader( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader + return MegatronSDLoader(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ + super().__init__(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ + self.check_ckpt_list() + exec(code, run_globals) + return _run_code(code, main_globals, None, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 765, in + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list + assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" + AssertionError: checkpoint count 1 is different from saved mp_world_size 4 + +[2]: + time : 2022-10-06_13:05:10 + host : jean-zay-iam11-ib0 + rank : 11 (local_rank: 3) + exitcode : 1 (pid: 3526431) + error_file: /tmp/torchelastic_8zfw451d/none_3xx1kznc/attempt_0/3/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 87, in _run_code + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + elastic_launch( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 131, in __call__ + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint + sd_loader = SDLoaderFactory.get_sd_loader( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader + return MegatronSDLoader(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ + super().__init__(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ + self.check_ckpt_list() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list + assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" + AssertionError: checkpoint count 1 is different from saved mp_world_size 4 + +[3]: + exec(code, run_globals) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 765, in + time : 2022-10-06_13:05:10 + host : jean-zay-iam11-ib0 + rank : 12 (local_rank: 4) + exitcode : 1 (pid: 3526432) + error_file: /tmp/torchelastic_8zfw451d/none_3xx1kznc/attempt_0/4/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint + sd_loader = SDLoaderFactory.get_sd_loader( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader + elastic_launch( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 131, in __call__ + return MegatronSDLoader(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ + super().__init__(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ + self.check_ckpt_list() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list + assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" + AssertionError: checkpoint count 1 is different from saved mp_world_size 4 + +[4]: + time : 2022-10-06_13:05:10 + host : jean-zay-iam11-ib0 + rank : 13 (local_rank: 5) + exitcode : 1 (pid: 3526433) + return launch_agent(self._config, self._entrypoint, list(args)) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 245, in launch_agent + error_file: /tmp/torchelastic_8zfw451d/none_3xx1kznc/attempt_0/5/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint + sd_loader = SDLoaderFactory.get_sd_loader( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader + return MegatronSDLoader(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ + super().__init__(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ + self.check_ckpt_list() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list + assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" + AssertionError: checkpoint count 1 is different from saved mp_world_size 4 + +[5]: + time : 2022-10-06_13:05:10 + host : jean-zay-iam11-ib0 + rank : 14 (local_rank: 6) + exitcode : 1 (pid: 3526434) + error_file: /tmp/torchelastic_8zfw451d/none_3xx1kznc/attempt_0/6/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint + sd_loader = SDLoaderFactory.get_sd_loader( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader + return MegatronSDLoader(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ + super().__init__(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ + self.check_ckpt_list() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list + assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" + AssertionError: checkpoint count 1 is different from saved mp_world_size 4 + +[6]: + time : 2022-10-06_13:05:10 + host : jean-zay-iam11-ib0 + rank : 15 (local_rank: 7) + exitcode : 1 (pid: 3526435) + error_file: /tmp/torchelastic_8zfw451d/none_3xx1kznc/attempt_0/7/error.json + traceback : Traceback (most recent call last): + return launch_agent(self._config, self._entrypoint, list(args)) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 245, in launch_agent + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + raise ChildFailedError( + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint + sd_loader = SDLoaderFactory.get_sd_loader( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader + return MegatronSDLoader(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ + super().__init__(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ + self.check_ckpt_list() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list + assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" + AssertionError: checkpoint count 1 is different from saved mp_world_size 4 + +------------------------------------------------------------ +Root Cause (first observed failure): +[0]: + time : 2022-10-06_13:05:10 + host : jean-zay-iam11-ib0 + rank : 9 (local_rank: 1) + exitcode : 1 (pid: 3526429) + error_file: /tmp/torchelastic_8zfw451d/none_3xx1kznc/attempt_0/1/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint + sd_loader = SDLoaderFactory.get_sd_loader( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader + return MegatronSDLoader(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ + super().__init__(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ + self.check_ckpt_list() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list + assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" + AssertionError: checkpoint count 1 is different from saved mp_world_size 4 + +============================================================ +Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 194, in _run_module_as_main +torch.distributed.elastic.multiprocessing.errors.ChildFailedError: +============================================================ +/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py FAILED +------------------------------------------------------------ +Failures: +[1]: + time : 2022-10-06_13:05:10 + host : jean-zay-iam22-ib0 + rank : 26 (local_rank: 2) + exitcode : 1 (pid: 3726478) + error_file: /tmp/torchelastic_bupjf241/none_4ra3jw7o/attempt_0/2/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + raise ChildFailedError( +torch.distributed.elastic.multiprocessing.errors.ChildFailedError: +============================================================ +/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py FAILED +------------------------------------------------------------ +Failures: +[1]: + time : 2022-10-06_13:05:10 + host : jean-zay-iam12-ib0 + rank : 17 (local_rank: 1) + exitcode : 1 (pid: 1094202) + error_file: /tmp/torchelastic_ykor2ju9/none_ojv3v17i/attempt_0/1/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint + sd_loader = SDLoaderFactory.get_sd_loader( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader + return MegatronSDLoader(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ + super().__init__(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ + self.check_ckpt_list() + main() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint + sd_loader = SDLoaderFactory.get_sd_loader( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader + return MegatronSDLoader(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ + super().__init__(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ + self.check_ckpt_list() + return _run_code(code, main_globals, None, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list + assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" + AssertionError: checkpoint count 1 is different from saved mp_world_size 4 + +[2]: + time : 2022-10-06_13:05:10 + host : jean-zay-iam22-ib0 + rank : 27 (local_rank: 3) + exitcode : 1 (pid: 3726479) + error_file: /tmp/torchelastic_bupjf241/none_4ra3jw7o/attempt_0/3/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 87, in _run_code + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list + assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" + AssertionError: checkpoint count 1 is different from saved mp_world_size 4 + +[2]: + time : 2022-10-06_13:05:10 + host : jean-zay-iam12-ib0 + rank : 18 (local_rank: 2) + exitcode : 1 (pid: 1094203) + error_file: /tmp/torchelastic_ykor2ju9/none_ojv3v17i/attempt_0/2/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint + sd_loader = SDLoaderFactory.get_sd_loader( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader + return MegatronSDLoader(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ + super().__init__(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ + self.check_ckpt_list() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list + assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" + AssertionError: checkpoint count 1 is different from saved mp_world_size 4 + +[3]: + exec(code, run_globals) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 765, in + time : 2022-10-06_13:05:11 + host : jean-zay-iam22-ib0 + rank : 28 (local_rank: 4) + exitcode : 1 (pid: 3726480) + error_file: /tmp/torchelastic_bupjf241/none_4ra3jw7o/attempt_0/4/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint + sd_loader = SDLoaderFactory.get_sd_loader( + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 761, in main + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint + sd_loader = SDLoaderFactory.get_sd_loader( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader + return MegatronSDLoader(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ + super().__init__(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ + self.check_ckpt_list() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list + assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" + AssertionError: checkpoint count 1 is different from saved mp_world_size 4 + +[3]: + return MegatronSDLoader(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ + super().__init__(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ + self.check_ckpt_list() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list + assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" + AssertionError: checkpoint count 1 is different from saved mp_world_size 4 + +[4]: + time : 2022-10-06_13:05:11 + host : jean-zay-iam22-ib0 + rank : 29 (local_rank: 5) + exitcode : 1 (pid: 3726481) + time : 2022-10-06_13:05:10 + host : jean-zay-iam12-ib0 + rank : 19 (local_rank: 3) + exitcode : 1 (pid: 1094204) + error_file: /tmp/torchelastic_ykor2ju9/none_ojv3v17i/attempt_0/3/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer + error_file: /tmp/torchelastic_bupjf241/none_4ra3jw7o/attempt_0/5/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint + sd_loader = SDLoaderFactory.get_sd_loader( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint + sd_loader = SDLoaderFactory.get_sd_loader( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader + return MegatronSDLoader(ckpt_list, version, checkpoint_engine) + return MegatronSDLoader(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ + super().__init__(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ + self.check_ckpt_list() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list + assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" + AssertionError: checkpoint count 1 is different from saved mp_world_size 4 + +[4]: + time : 2022-10-06_13:05:10 + host : jean-zay-iam12-ib0 + rank : 20 (local_rank: 4) + exitcode : 1 (pid: 1094205) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ + super().__init__(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ + self.check_ckpt_list() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list + assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" + AssertionError: checkpoint count 1 is different from saved mp_world_size 4 + +[5]: + time : 2022-10-06_13:05:11 + host : jean-zay-iam22-ib0 + rank : 30 (local_rank: 6) + exitcode : 1 (pid: 3726482) + error_file: /tmp/torchelastic_bupjf241/none_4ra3jw7o/attempt_0/6/error.json + error_file: /tmp/torchelastic_ykor2ju9/none_ojv3v17i/attempt_0/4/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + traceback : Traceback (most recent call last): + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint + sd_loader = SDLoaderFactory.get_sd_loader( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader + return MegatronSDLoader(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ + super().__init__(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ + self.check_ckpt_list() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list + assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" + AssertionError: checkpoint count 1 is different from saved mp_world_size 4 + +[5]: + time : 2022-10-06_13:05:10 + host : jean-zay-iam12-ib0 + rank : 21 (local_rank: 5) + exitcode : 1 (pid: 1094206) + error_file: /tmp/torchelastic_ykor2ju9/none_ojv3v17i/attempt_0/5/error.json + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint + sd_loader = SDLoaderFactory.get_sd_loader( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader + return MegatronSDLoader(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ + traceback : Traceback (most recent call last): + super().__init__(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ + self.check_ckpt_list() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list + assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" + AssertionError: checkpoint count 1 is different from saved mp_world_size 4 + +[6]: + time : 2022-10-06_13:05:11 + host : jean-zay-iam22-ib0 + rank : 31 (local_rank: 7) + exitcode : 1 (pid: 3726483) + error_file: /tmp/torchelastic_bupjf241/none_4ra3jw7o/attempt_0/7/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint + sd_loader = SDLoaderFactory.get_sd_loader( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader + return MegatronSDLoader(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + super().__init__(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ + self.check_ckpt_list() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list + assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" + AssertionError: checkpoint count 1 is different from saved mp_world_size 4 + +[6]: + time : 2022-10-06_13:05:10 + host : jean-zay-iam12-ib0 + rank : 22 (local_rank: 6) + exitcode : 1 (pid: 1094207) + error_file: /tmp/torchelastic_ykor2ju9/none_ojv3v17i/attempt_0/6/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint + sd_loader = SDLoaderFactory.get_sd_loader( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader + return MegatronSDLoader(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ + super().__init__(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ + self.check_ckpt_list() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list + assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" + AssertionError: checkpoint count 1 is different from saved mp_world_size 4 + +------------------------------------------------------------ +Root Cause (first observed failure): +[0]: + time : 2022-10-06_13:05:10 + host : jean-zay-iam22-ib0 + rank : 25 (local_rank: 1) + exitcode : 1 (pid: 3726477) + error_file: /tmp/torchelastic_bupjf241/none_4ra3jw7o/attempt_0/1/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint + sd_loader = SDLoaderFactory.get_sd_loader( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader + return MegatronSDLoader(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ + super().__init__(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ + self.check_ckpt_list() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint + sd_loader = SDLoaderFactory.get_sd_loader( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader + return MegatronSDLoader(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ + super().__init__(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ + self.check_ckpt_list() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list + assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" + AssertionError: checkpoint count 1 is different from saved mp_world_size 4 + +[7]: + time : 2022-10-06_13:05:10 + host : jean-zay-iam12-ib0 + rank : 23 (local_rank: 7) + exitcode : 1 (pid: 1094208) + error_file: /tmp/torchelastic_ykor2ju9/none_ojv3v17i/attempt_0/7/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list + assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" + AssertionError: checkpoint count 1 is different from saved mp_world_size 4 + +============================================================ + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint + sd_loader = SDLoaderFactory.get_sd_loader( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader + return MegatronSDLoader(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ + super().__init__(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ + self.check_ckpt_list() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list + assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" + AssertionError: checkpoint count 1 is different from saved mp_world_size 4 + +------------------------------------------------------------ +Root Cause (first observed failure): +[0]: + time : 2022-10-06_13:05:10 + host : jean-zay-iam12-ib0 + rank : 16 (local_rank: 0) + exitcode : 1 (pid: 1094201) + error_file: /tmp/torchelastic_ykor2ju9/none_ojv3v17i/attempt_0/0/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint + sd_loader = SDLoaderFactory.get_sd_loader( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader + return MegatronSDLoader(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ + super().__init__(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ + self.check_ckpt_list() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list + assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" + AssertionError: checkpoint count 1 is different from saved mp_world_size 4 + +============================================================ + run(args) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 752, in run +Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 194, in _run_module_as_main + elastic_launch( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 131, in __call__ + return _run_code(code, main_globals, None, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 87, in _run_code + exec(code, run_globals) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 765, in +Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 194, in _run_module_as_main + return _run_code(code, main_globals, None, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 87, in _run_code + exec(code, run_globals) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 765, in + main() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 761, in main + main() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 761, in main + run(args) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 752, in run + run(args) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 752, in run + elastic_launch( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 131, in __call__ + return launch_agent(self._config, self._entrypoint, list(args)) + return launch_agent(self._config, self._entrypoint, list(args)) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 245, in launch_agent + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 245, in launch_agent + raise ChildFailedError( + raise ChildFailedError( +torch.distributed.elastic.multiprocessing.errors.ChildFailedError: +============================================================ +/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py FAILED +------------------------------------------------------------ +Failures: +[1]: + time : 2022-10-06_13:05:10 + host : jean-zay-iam25-ib0 + rank : 49 (local_rank: 1) + exitcode : 1 (pid: 75208) + error_file: /tmp/torchelastic_up5mmxne/none__zfgf_24/attempt_0/1/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + elastic_launch( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 131, in __call__ +torch.distributed.elastic.multiprocessing.errors.ChildFailedError: +============================================================ +/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py FAILED +------------------------------------------------------------ +Failures: +[1]: + time : 2022-10-06_13:05:11 + host : jean-zay-iam10-ib0 + rank : 2 (local_rank: 2) + exitcode : 1 (pid: 2797205) + error_file: /tmp/torchelastic_6w2ah75f/none_tvx12z5y/attempt_0/2/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint + sd_loader = SDLoaderFactory.get_sd_loader( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader + return MegatronSDLoader(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ + super().__init__(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ + self.check_ckpt_list() + return launch_agent(self._config, self._entrypoint, list(args)) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 245, in launch_agent + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint + sd_loader = SDLoaderFactory.get_sd_loader( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader + return MegatronSDLoader(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ + super().__init__(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ + self.check_ckpt_list() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list + assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" + AssertionError: checkpoint count 1 is different from saved mp_world_size 4 + +[2]: + time : 2022-10-06_13:05:10 + host : jean-zay-iam25-ib0 + rank : 50 (local_rank: 2) + exitcode : 1 (pid: 75209) + error_file: /tmp/torchelastic_up5mmxne/none__zfgf_24/attempt_0/2/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list + assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" + AssertionError: checkpoint count 1 is different from saved mp_world_size 4 + +[2]: + time : 2022-10-06_13:05:11 + host : jean-zay-iam10-ib0 + rank : 3 (local_rank: 3) + exitcode : 1 (pid: 2797206) + error_file: /tmp/torchelastic_6w2ah75f/none_tvx12z5y/attempt_0/3/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + main() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint + sd_loader = SDLoaderFactory.get_sd_loader( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader + return MegatronSDLoader(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ + super().__init__(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ + self.check_ckpt_list() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list + assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" + AssertionError: checkpoint count 1 is different from saved mp_world_size 4 + +[3]: + time : 2022-10-06_13:05:11 + host : jean-zay-iam10-ib0 + rank : 4 (local_rank: 4) + exitcode : 1 (pid: 2797207) + error_file: /tmp/torchelastic_6w2ah75f/none_tvx12z5y/attempt_0/4/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint + sd_loader = SDLoaderFactory.get_sd_loader( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader + return MegatronSDLoader(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ + super().__init__(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ + self.check_ckpt_list() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list + assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" + AssertionError: checkpoint count 1 is different from saved mp_world_size 4 + +[4]: + time : 2022-10-06_13:05:11 + host : jean-zay-iam10-ib0 + rank : 5 (local_rank: 5) + exitcode : 1 (pid: 2797208) + raise ChildFailedError( + error_file: /tmp/torchelastic_6w2ah75f/none_tvx12z5y/attempt_0/5/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint + sd_loader = SDLoaderFactory.get_sd_loader( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader + return MegatronSDLoader(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ + super().__init__(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ + self.check_ckpt_list() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list + assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" + AssertionError: checkpoint count 1 is different from saved mp_world_size 4 + +[5]: + time : 2022-10-06_13:05:11 + host : jean-zay-iam10-ib0 + rank : 6 (local_rank: 6) + exitcode : 1 (pid: 2797209) + error_file: /tmp/torchelastic_6w2ah75f/none_tvx12z5y/attempt_0/6/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint + sd_loader = SDLoaderFactory.get_sd_loader( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader + return MegatronSDLoader(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ + super().__init__(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ + self.check_ckpt_list() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list + assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" + AssertionError: checkpoint count 1 is different from saved mp_world_size 4 + +[6]: + time : 2022-10-06_13:05:11 + host : jean-zay-iam10-ib0 + rank : 7 (local_rank: 7) + exitcode : 1 (pid: 2797210) + error_file: /tmp/torchelastic_6w2ah75f/none_tvx12z5y/attempt_0/7/error.json + traceback : Traceback (most recent call last): + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint + sd_loader = SDLoaderFactory.get_sd_loader( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader + return MegatronSDLoader(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ + super().__init__(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ + self.check_ckpt_list() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list + assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" + AssertionError: checkpoint count 1 is different from saved mp_world_size 4 + +[3]: + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + time : 2022-10-06_13:05:10 + host : jean-zay-iam25-ib0 + rank : 51 (local_rank: 3) + exitcode : 1 (pid: 75210) + error_file: /tmp/torchelastic_up5mmxne/none__zfgf_24/attempt_0/3/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint + sd_loader = SDLoaderFactory.get_sd_loader( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader + return MegatronSDLoader(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ + super().__init__(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ + self.check_ckpt_list() + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint + sd_loader = SDLoaderFactory.get_sd_loader( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list + assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" + AssertionError: checkpoint count 1 is different from saved mp_world_size 4 + +------------------------------------------------------------ +Root Cause (first observed failure): +[0]: + time : 2022-10-06_13:05:11 + host : jean-zay-iam10-ib0 + rank : 1 (local_rank: 1) + exitcode : 1 (pid: 2797204) + error_file: /tmp/torchelastic_6w2ah75f/none_tvx12z5y/attempt_0/1/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + return MegatronSDLoader(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ + super().__init__(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ + self.check_ckpt_list() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list + assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" + AssertionError: checkpoint count 1 is different from saved mp_world_size 4 + +[4]: + time : 2022-10-06_13:05:10 + host : jean-zay-iam25-ib0 + rank : 52 (local_rank: 4) + exitcode : 1 (pid: 75211) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + error_file: /tmp/torchelastic_up5mmxne/none__zfgf_24/attempt_0/4/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +torch.distributed.elastic.multiprocessing.errors.ChildFailedError: +============================================================ +/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py FAILED +------------------------------------------------------------ +Failures: +[1]: + time : 2022-10-06_13:05:10 + host : jean-zay-iam24-ib0 + rank : 42 (local_rank: 2) + exitcode : 1 (pid: 881242) + error_file: /tmp/torchelastic_n7ztc6jc/none_09lfjb6r/attempt_0/2/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint + sd_loader = SDLoaderFactory.get_sd_loader( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader + return MegatronSDLoader(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ + super().__init__(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ + self.check_ckpt_list() + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint + sd_loader = SDLoaderFactory.get_sd_loader( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader + return MegatronSDLoader(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list + assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" + AssertionError: checkpoint count 1 is different from saved mp_world_size 4 + +============================================================ + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ + super().__init__(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ + self.check_ckpt_list() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list + assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" + AssertionError: checkpoint count 1 is different from saved mp_world_size 4 + +[5]: + time : 2022-10-06_13:05:10 + host : jean-zay-iam25-ib0 + rank : 53 (local_rank: 5) + exitcode : 1 (pid: 75212) + error_file: /tmp/torchelastic_up5mmxne/none__zfgf_24/attempt_0/5/error.json + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 761, in main + main() + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint + sd_loader = SDLoaderFactory.get_sd_loader( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader + return MegatronSDLoader(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint + sd_loader = SDLoaderFactory.get_sd_loader( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader + return MegatronSDLoader(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ + super().__init__(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ + self.check_ckpt_list() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list + assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" + AssertionError: checkpoint count 1 is different from saved mp_world_size 4 + +[2]: + time : 2022-10-06_13:05:10 + host : jean-zay-iam24-ib0 + rank : 43 (local_rank: 3) + exitcode : 1 (pid: 881243) + error_file: /tmp/torchelastic_n7ztc6jc/none_09lfjb6r/attempt_0/3/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint + sd_loader = SDLoaderFactory.get_sd_loader( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader + return MegatronSDLoader(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ + super().__init__(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ + self.check_ckpt_list() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list + assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" + AssertionError: checkpoint count 1 is different from saved mp_world_size 4 + +[3]: + time : 2022-10-06_13:05:10 + host : jean-zay-iam24-ib0 + rank : 44 (local_rank: 4) + exitcode : 1 (pid: 881244) + error_file: /tmp/torchelastic_n7ztc6jc/none_09lfjb6r/attempt_0/4/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint + sd_loader = SDLoaderFactory.get_sd_loader( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader + return MegatronSDLoader(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ + super().__init__(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ + self.check_ckpt_list() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list + assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" + AssertionError: checkpoint count 1 is different from saved mp_world_size 4 + +[4]: + time : 2022-10-06_13:05:10 + host : jean-zay-iam24-ib0 + rank : 45 (local_rank: 5) + exitcode : 1 (pid: 881245) + error_file: /tmp/torchelastic_n7ztc6jc/none_09lfjb6r/attempt_0/5/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint + sd_loader = SDLoaderFactory.get_sd_loader( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader + return MegatronSDLoader(ckpt_list, version, checkpoint_engine) + run(args) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 752, in run + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 761, in main + elastic_launch( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 131, in __call__ + return launch_agent(self._config, self._entrypoint, list(args)) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 245, in launch_agent + raise ChildFailedError( +torch.distributed.elastic.multiprocessing.errors.ChildFailedError: +============================================================ +/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py FAILED +------------------------------------------------------------ +Failures: +[1]: + time : 2022-10-06_13:05:10 + host : jean-zay-iam23-ib0 + rank : 33 (local_rank: 1) + exitcode : 1 (pid: 2861838) + error_file: /tmp/torchelastic_jmkfzx7x/none__td414zp/attempt_0/1/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + run(args) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 752, in run + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint + sd_loader = SDLoaderFactory.get_sd_loader( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader + return MegatronSDLoader(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ + super().__init__(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ + self.check_ckpt_list() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list + assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" + AssertionError: checkpoint count 1 is different from saved mp_world_size 4 + +[2]: + time : 2022-10-06_13:05:10 + host : jean-zay-iam23-ib0 + rank : 34 (local_rank: 2) + exitcode : 1 (pid: 2861839) + error_file: /tmp/torchelastic_jmkfzx7x/none__td414zp/attempt_0/2/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint + sd_loader = SDLoaderFactory.get_sd_loader( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader + return MegatronSDLoader(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ + super().__init__(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ + self.check_ckpt_list() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list + assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" + AssertionError: checkpoint count 1 is different from saved mp_world_size 4 + +[3]: + time : 2022-10-06_13:05:10 + host : jean-zay-iam23-ib0 + rank : 35 (local_rank: 3) + exitcode : 1 (pid: 2861840) + error_file: /tmp/torchelastic_jmkfzx7x/none__td414zp/attempt_0/3/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint + sd_loader = SDLoaderFactory.get_sd_loader( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader + return MegatronSDLoader(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ + super().__init__(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ + self.check_ckpt_list() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list + assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" + AssertionError: checkpoint count 1 is different from saved mp_world_size 4 + +[4]: + time : 2022-10-06_13:05:10 + host : jean-zay-iam23-ib0 + rank : 36 (local_rank: 4) + exitcode : 1 (pid: 2861841) + error_file: /tmp/torchelastic_jmkfzx7x/none__td414zp/attempt_0/4/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint + sd_loader = SDLoaderFactory.get_sd_loader( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader + return MegatronSDLoader(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ + super().__init__(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ + self.check_ckpt_list() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list + assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" + AssertionError: checkpoint count 1 is different from saved mp_world_size 4 + +[5]: + time : 2022-10-06_13:05:10 + host : jean-zay-iam23-ib0 + rank : 37 (local_rank: 5) + exitcode : 1 (pid: 2861842) + error_file: /tmp/torchelastic_jmkfzx7x/none__td414zp/attempt_0/5/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint + sd_loader = SDLoaderFactory.get_sd_loader( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader + return MegatronSDLoader(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ + super().__init__(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ + self.check_ckpt_list() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list + assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" + AssertionError: checkpoint count 1 is different from saved mp_world_size 4 + +[6]: + time : 2022-10-06_13:05:10 + host : jean-zay-iam23-ib0 + rank : 38 (local_rank: 6) + exitcode : 1 (pid: 2861843) + error_file: /tmp/torchelastic_jmkfzx7x/none__td414zp/attempt_0/6/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint + sd_loader = SDLoaderFactory.get_sd_loader( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader + return MegatronSDLoader(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ + super().__init__(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ + self.check_ckpt_list() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list + assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" + AssertionError: checkpoint count 1 is different from saved mp_world_size 4 + +[7]: + time : 2022-10-06_13:05:10 + host : jean-zay-iam23-ib0 + rank : 39 (local_rank: 7) + exitcode : 1 (pid: 2861844) + error_file: /tmp/torchelastic_jmkfzx7x/none__td414zp/attempt_0/7/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint + sd_loader = SDLoaderFactory.get_sd_loader( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader + return MegatronSDLoader(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ + super().__init__(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ + self.check_ckpt_list() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list + assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" + AssertionError: checkpoint count 1 is different from saved mp_world_size 4 + +------------------------------------------------------------ +Root Cause (first observed failure): +[0]: + time : 2022-10-06_13:05:10 + host : jean-zay-iam23-ib0 + rank : 32 (local_rank: 0) + exitcode : 1 (pid: 2861837) + error_file: /tmp/torchelastic_jmkfzx7x/none__td414zp/attempt_0/0/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint + sd_loader = SDLoaderFactory.get_sd_loader( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader + return MegatronSDLoader(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ + super().__init__(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ + self.check_ckpt_list() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list + assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" + AssertionError: checkpoint count 1 is different from saved mp_world_size 4 + +============================================================ + elastic_launch( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 131, in __call__ + return launch_agent(self._config, self._entrypoint, list(args)) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 245, in launch_agent + raise ChildFailedError( +torch.distributed.elastic.multiprocessing.errors.ChildFailedError: +============================================================ +/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py FAILED +------------------------------------------------------------ +Failures: +[1]: + time : 2022-10-06_13:05:10 + host : jean-zay-iam31-ib0 + rank : 57 (local_rank: 1) + exitcode : 1 (pid: 826588) + error_file: /tmp/torchelastic_ahxrb79r/none_u7_0sexf/attempt_0/1/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint + sd_loader = SDLoaderFactory.get_sd_loader( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader + return MegatronSDLoader(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ + super().__init__(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ + self.check_ckpt_list() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list + assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" + AssertionError: checkpoint count 1 is different from saved mp_world_size 4 + +[2]: + time : 2022-10-06_13:05:10 + host : jean-zay-iam31-ib0 + rank : 58 (local_rank: 2) + exitcode : 1 (pid: 826589) + error_file: /tmp/torchelastic_ahxrb79r/none_u7_0sexf/attempt_0/2/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint + sd_loader = SDLoaderFactory.get_sd_loader( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader + return MegatronSDLoader(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ + super().__init__(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ + self.check_ckpt_list() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list + assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" + AssertionError: checkpoint count 1 is different from saved mp_world_size 4 + +[3]: + time : 2022-10-06_13:05:10 + host : jean-zay-iam31-ib0 + rank : 59 (local_rank: 3) + exitcode : 1 (pid: 826590) + error_file: /tmp/torchelastic_ahxrb79r/none_u7_0sexf/attempt_0/3/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint + sd_loader = SDLoaderFactory.get_sd_loader( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader + return MegatronSDLoader(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ + super().__init__(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ + self.check_ckpt_list() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list + assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" + AssertionError: checkpoint count 1 is different from saved mp_world_size 4 + +[4]: + time : 2022-10-06_13:05:10 + host : jean-zay-iam31-ib0 + rank : 60 (local_rank: 4) + exitcode : 1 (pid: 826591) + error_file: /tmp/torchelastic_ahxrb79r/none_u7_0sexf/attempt_0/4/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint + sd_loader = SDLoaderFactory.get_sd_loader( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader + return MegatronSDLoader(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ + super().__init__(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ + self.check_ckpt_list() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list + assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" + AssertionError: checkpoint count 1 is different from saved mp_world_size 4 + +[5]: + time : 2022-10-06_13:05:10 + host : jean-zay-iam31-ib0 + rank : 61 (local_rank: 5) + exitcode : 1 (pid: 826592) + error_file: /tmp/torchelastic_ahxrb79r/none_u7_0sexf/attempt_0/5/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint + sd_loader = SDLoaderFactory.get_sd_loader( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader + return MegatronSDLoader(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ + super().__init__(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ + self.check_ckpt_list() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list + assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" + AssertionError: checkpoint count 1 is different from saved mp_world_size 4 + +[6]: + time : 2022-10-06_13:05:10 + host : jean-zay-iam31-ib0 + rank : 62 (local_rank: 6) + exitcode : 1 (pid: 826593) + error_file: /tmp/torchelastic_ahxrb79r/none_u7_0sexf/attempt_0/6/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint + sd_loader = SDLoaderFactory.get_sd_loader( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader + return MegatronSDLoader(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ + super().__init__(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ + self.check_ckpt_list() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list + assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" + AssertionError: checkpoint count 1 is different from saved mp_world_size 4 + +[7]: + time : 2022-10-06_13:05:10 + host : jean-zay-iam31-ib0 + rank : 63 (local_rank: 7) + exitcode : 1 (pid: 826594) + error_file: /tmp/torchelastic_ahxrb79r/none_u7_0sexf/attempt_0/7/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint + sd_loader = SDLoaderFactory.get_sd_loader( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader + return MegatronSDLoader(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ + super().__init__(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ + self.check_ckpt_list() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list + assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" + AssertionError: checkpoint count 1 is different from saved mp_world_size 4 + +------------------------------------------------------------ +Root Cause (first observed failure): +[0]: + time : 2022-10-06_13:05:10 + host : jean-zay-iam31-ib0 + rank : 56 (local_rank: 0) + exitcode : 1 (pid: 826587) + error_file: /tmp/torchelastic_ahxrb79r/none_u7_0sexf/attempt_0/0/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint + sd_loader = SDLoaderFactory.get_sd_loader( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader + return MegatronSDLoader(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ + super().__init__(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ + self.check_ckpt_list() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list + assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" + AssertionError: checkpoint count 1 is different from saved mp_world_size 4 + +============================================================ + super().__init__(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ + self.check_ckpt_list() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list + assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" + AssertionError: checkpoint count 1 is different from saved mp_world_size 4 + +[6]: + time : 2022-10-06_13:05:10 + host : jean-zay-iam25-ib0 + rank : 54 (local_rank: 6) + exitcode : 1 (pid: 75213) + error_file: /tmp/torchelastic_up5mmxne/none__zfgf_24/attempt_0/6/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ + super().__init__(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ + self.check_ckpt_list() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list + assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" + AssertionError: checkpoint count 1 is different from saved mp_world_size 4 + +[5]: + time : 2022-10-06_13:05:10 + host : jean-zay-iam24-ib0 + rank : 46 (local_rank: 6) + exitcode : 1 (pid: 881246) + error_file: /tmp/torchelastic_n7ztc6jc/none_09lfjb6r/attempt_0/6/error.json + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + traceback : Traceback (most recent call last): + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint + sd_loader = SDLoaderFactory.get_sd_loader( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader + return MegatronSDLoader(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ + super().__init__(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ + self.check_ckpt_list() + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint + sd_loader = SDLoaderFactory.get_sd_loader( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader + return MegatronSDLoader(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list + assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" + AssertionError: checkpoint count 1 is different from saved mp_world_size 4 + +[7]: + time : 2022-10-06_13:05:10 + host : jean-zay-iam25-ib0 + rank : 55 (local_rank: 7) + exitcode : 1 (pid: 75214) + error_file: /tmp/torchelastic_up5mmxne/none__zfgf_24/attempt_0/7/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + super().__init__(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ + self.check_ckpt_list() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list + assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" + AssertionError: checkpoint count 1 is different from saved mp_world_size 4 + +[6]: + time : 2022-10-06_13:05:10 + host : jean-zay-iam24-ib0 + rank : 47 (local_rank: 7) + exitcode : 1 (pid: 881247) + error_file: /tmp/torchelastic_n7ztc6jc/none_09lfjb6r/attempt_0/7/error.json + traceback : Traceback (most recent call last): + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint + sd_loader = SDLoaderFactory.get_sd_loader( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader + return MegatronSDLoader(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ + super().__init__(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ + self.check_ckpt_list() + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint + sd_loader = SDLoaderFactory.get_sd_loader( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader + return MegatronSDLoader(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ + super().__init__(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ + self.check_ckpt_list() + assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" + AssertionError: checkpoint count 1 is different from saved mp_world_size 4 + +------------------------------------------------------------ +Root Cause (first observed failure): +[0]: + time : 2022-10-06_13:05:10 + host : jean-zay-iam25-ib0 + rank : 48 (local_rank: 0) + exitcode : 1 (pid: 75207) + error_file: /tmp/torchelastic_up5mmxne/none__zfgf_24/attempt_0/0/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list + assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" + AssertionError: checkpoint count 1 is different from saved mp_world_size 4 + +------------------------------------------------------------ +Root Cause (first observed failure): +[0]: + time : 2022-10-06_13:05:10 + host : jean-zay-iam24-ib0 + rank : 41 (local_rank: 1) + exitcode : 1 (pid: 881241) + error_file: /tmp/torchelastic_n7ztc6jc/none_09lfjb6r/attempt_0/1/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint + sd_loader = SDLoaderFactory.get_sd_loader( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader + return MegatronSDLoader(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ + super().__init__(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ + self.check_ckpt_list() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint + sd_loader = SDLoaderFactory.get_sd_loader( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader + return MegatronSDLoader(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ + super().__init__(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ + self.check_ckpt_list() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list + assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" + AssertionError: checkpoint count 1 is different from saved mp_world_size 4 + +============================================================ + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list + assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" + AssertionError: checkpoint count 1 is different from saved mp_world_size 4 + +============================================================ +srun: error: jean-zay-iam10: task 0: Exited with exit code 1 +srun: launch/slurm: _step_signal: Terminating StepId=2075534.0 +srun: error: jean-zay-iam22: task 3: Exited with exit code 1 +srun: error: jean-zay-iam11: task 1: Exited with exit code 1 +srun: error: jean-zay-iam24: task 5: Exited with exit code 1 +srun: error: jean-zay-iam25: task 6: Exited with exit code 1 +srun: error: jean-zay-iam31: task 7: Exited with exit code 1 +srun: error: jean-zay-iam12: task 2: Exited with exit code 1 +srun: error: jean-zay-iam23: task 4: Exited with exit code 1 +WARNING:__main__: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +WARNING:__main__: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +WARNING:__main__: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +WARNING:__main__: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +WARNING:__main__: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +WARNING:__main__: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +WARNING:__main__: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +WARNING:__main__: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +[default0]:using world size: 64, data-parallel-size: 16, tensor-model-parallel size: 2, pipeline-model-parallel size: 2 +[default0]:using torch.float16 for parameters ... +[default0]:------------------------ arguments ------------------------ +[default0]: abort_on_unmet_fused_kernel_constraints ......... True +[default0]: accumulate_allreduce_grads_in_fp32 .............. False +[default0]: adam_beta1 ...................................... 0.9 +[default0]: adam_beta2 ...................................... 0.95 +[default0]: adam_eps ........................................ 1e-08 +[default0]: adlr_autoresume ................................. False +[default0]: adlr_autoresume_interval ........................ 1000 +[default0]: apply_query_key_layer_scaling ................... True +[default0]: apply_residual_connection_post_layernorm ........ False +[default0]: attention_dropout ............................... 0.1 +[default0]: attention_softmax_in_fp32 ....................... False +[default0]: bert_binary_head ................................ True +[default0]: bert_load ....................................... None +[default0]: bf16 ............................................ False +[default0]: bias_dropout_fusion ............................. True +[default0]: bias_gelu_fusion ................................ True +[default0]: biencoder_projection_dim ........................ 0 +[default0]: biencoder_shared_query_context_model ............ False +[default0]: block_data_path ................................. None +[default0]: checkpoint_activations .......................... True +[default0]: checkpoint_in_cpu ............................... False +[default0]: checkpoint_num_layers ........................... 1 +[default0]: clip_grad ....................................... 1.0 +[default0]: codecarbon_dir .................................. None +[default0]: consumed_train_samples .......................... 0 +[default0]: consumed_train_tokens ........................... 0 +[default0]: consumed_valid_samples .......................... 0 +[default0]: contigious_checkpointing ........................ False +[default0]: cpu_optimizer ................................... False +[default0]: cpu_torch_adam .................................. False +[default0]: curriculum_learning ............................. False +[default0]: data_impl ....................................... mmap +[default0]: data_parallel_size .............................. 16 +[default0]: data_path ....................................... None +[default0]: dataloader_type ................................. single +[default0]: DDP_impl ........................................ local +[default0]: decoder_seq_length .............................. None +[default0]: deepscale ....................................... False +[default0]: deepscale_config ................................ None +[default0]: deepspeed ....................................... True +[default0]: deepspeed_activation_checkpointing .............. True +[default0]: deepspeed_config ................................ ./ds_config.2075637.json +[default0]: deepspeed_mpi ................................... False +[default0]: distribute_checkpointed_activations ............. False +[default0]: distributed_backend ............................. nccl +[default0]: embed_layernorm ................................. True +[default0]: embedding_path .................................. None +[default0]: encoder_seq_length .............................. 2048 +[default0]: eod_mask_loss ................................... False +[default0]: eval_interval ................................... 125 +[default0]: eval_iters ...................................... 10 +[default0]: eval_only ....................................... None +[default0]: evidence_data_path .............................. None +[default0]: exit_duration_in_mins ........................... 5990 +[default0]: exit_interval ................................... None +[default0]: ffn_hidden_size ................................. 6144 +[default0]: finetune ........................................ False +[default0]: fp16 ............................................ True +[default0]: fp16_lm_cross_entropy ........................... False +[default0]: fp32_residual_connection ........................ False +[default0]: gigaflos_no_embeds .............................. 0 +[default0]: global_batch_size ............................... 2048 +[default0]: glu_activation .................................. None +[default0]: hidden_dropout .................................. 0.1 +[default0]: hidden_size ..................................... 1536 +[default0]: hysteresis ...................................... 2 +[default0]: ict_head_size ................................... None +[default0]: ict_load ........................................ None +[default0]: img_dim ......................................... 224 +[default0]: indexer_batch_size .............................. 128 +[default0]: indexer_log_interval ............................ 1000 +[default0]: inference ....................................... False +[default0]: init_method_std ................................. 0.0048 +[default0]: init_method_xavier_uniform ...................... False +[default0]: initial_loss_scale .............................. 4294967296 +[default0]: kill_switch_path ................................ /gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/kill-switch-tr13b-1B3-mtf +[default0]: kv_channels ..................................... 96 +[default0]: layernorm_epsilon ............................... 1e-05 +[default0]: lazy_mpu_init ................................... None +[default0]: load ............................................ /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq +[default0]: local_rank ...................................... None +[default0]: log_batch_size_to_tensorboard ................... True +[default0]: log_interval .................................... 1 +[default0]: log_learning_rate_to_tensorboard ................ True +[default0]: log_level ....................................... None +[default0]: log_level_replica ............................... None +[default0]: log_loss_scale_to_tensorboard ................... True +[default0]: log_num_zeros_in_grad ........................... False +[default0]: log_params_norm ................................. False +[default0]: log_path ........................................ None +[default0]: log_timers_to_tensorboard ....................... True +[default0]: log_validation_ppl_to_tensorboard ............... True +[default0]: loss_on_targets_only ............................ False +[default0]: loss_scale ...................................... None +[default0]: loss_scale_window ............................... 1000 +[default0]: lr .............................................. 2e-05 +[default0]: lr_decay_iters .................................. None +[default0]: lr_decay_samples ................................ None +[default0]: lr_decay_style .................................. constant +[default0]: lr_decay_tokens ................................. None +[default0]: lr_warmup_fraction .............................. None +[default0]: lr_warmup_iters ................................. 0 +[default0]: lr_warmup_samples ............................... 0 +[default0]: make_vocab_size_divisible_by .................... 128 +[default0]: mask_prob ....................................... 0.15 +[default0]: masked_softmax_fusion ........................... True +[default0]: max_position_embeddings ......................... 2048 +[default0]: mean_noise_span_length .......................... None +[default0]: memory_centric_tiled_linear ..................... False +[default0]: merge_file ...................................... None +[default0]: micro_batch_size ................................ 1 +[default0]: min_loss_scale .................................. 1.0 +[default0]: min_lr .......................................... 0.0 +[default0]: mmap_warmup ..................................... False +[default0]: no_load_optim ................................... True +[default0]: no_load_rng ..................................... None +[default0]: no_save_optim ................................... None +[default0]: no_save_rng ..................................... None +[default0]: noise_density ................................... None +[default0]: norm_target_loss ................................ True +[default0]: num_attention_heads ............................. 16 +[default0]: num_channels .................................... 3 +[default0]: num_classes ..................................... 1000 +[default0]: num_layers ...................................... 24 +[default0]: num_layers_per_virtual_pipeline_stage ........... None +[default0]: num_workers ..................................... 2 +[default0]: onnx_safe ....................................... None +[default0]: openai_gelu ..................................... False +[default0]: optimizer ....................................... adam +[default0]: override_lr_scheduler ........................... False +[default0]: pad_vocab_size_to ............................... 250880 +[default0]: params_dtype .................................... torch.float16 +[default0]: partition_activations ........................... False +[default0]: patch_dim ....................................... 16 +[default0]: pipeline_model_parallel_size .................... 2 +[default0]: position_embedding_type ......................... PositionEmbeddingType.alibi +[default0]: pp_partition_method ............................. type:transformer|embedding +[default0]: prefixlm ........................................ False +[default0]: profile_backward ................................ False +[default0]: query_in_block_prob ............................. 0.1 +[default0]: rampup_batch_size ............................... None +[default0]: rank ............................................ 0 +[default0]: remote_device ................................... none +[default0]: reset_attention_mask ............................ False +[default0]: reset_position_ids .............................. False +[default0]: reset_progress .................................. True +[default0]: retriever_report_topk_accuracies ................ [] +[default0]: retriever_score_scaling ......................... False +[default0]: retriever_seq_length ............................ 256 +[default0]: reweight_loss_based_on_position_frequency ....... False +[default0]: sample_rate ..................................... 1.0 +[default0]: save ............................................ /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq +[default0]: save_interval ................................... 2 +[default0]: scatter_gather_tensors_in_pipeline .............. True +[default0]: scattered_embeddings ............................ False +[default0]: seed ............................................ 42 +[default0]: seq_length ...................................... 2048 +[default0]: sgd_momentum .................................... 0.9 +[default0]: short_seq_prob .................................. 0.1 +[default0]: skip_train_iteration_range ...................... None +[default0]: split ........................................... None +[default0]: split_transformers .............................. False +[default0]: sync_tp_duplicated_parameters ................... False +[default0]: synchronize_each_layer .......................... False +[default0]: tensor_model_parallel_size ...................... 2 +[default0]: tensorboard_dir ................................. /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/tr13b-1B3-ml-logs/tensorboard/xp3capmixnewcodelonglossseq +[default0]: tensorboard_log_interval ........................ 1 +[default0]: tensorboard_queue_size .......................... 5 +[default0]: test_weighted_split_paths ....................... None +[default0]: test_weighted_split_paths_path .................. None +[default0]: tile_factor ..................................... 1 +[default0]: titles_data_path ................................ None +[default0]: tokenizer_name_or_path .......................... bigscience/tokenizer +[default0]: tokenizer_type .................................. PretrainedFromHF +[default0]: train_iters ..................................... None +[default0]: train_samples ................................... 6348800 +[default0]: train_tokens .................................... None +[default0]: train_weighted_split_names ...................... ['train'] +[default0]: train_weighted_split_paths ...................... [['/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_en', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_es', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_pt', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_code', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_fr', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ar', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_id', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_zh', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_hi', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_vi', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ur', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_te', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ta', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_bn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_mr', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_sw', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_gu', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_pa', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ne', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_yo', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ig', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ny', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_zu', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_xh', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_sn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ts', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_rw', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_lg', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_tn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_nso', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_rn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ml', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_kn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_or', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_as', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ln', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_wo', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_tum', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ki', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_st', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_fon', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ca', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_eu', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ak', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_bm', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_tw']] +[default0]: train_weighted_split_paths_path ................. None +[default0]: train_weighted_split_splits ..................... [['0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950']] +[default0]: train_weighted_split_weights .................... [['0.3924620202', '0.0797519865', '0.0645613968', '0.0592991658', '0.0584218969', '0.0492644683', '0.0485168956', '0.048344327', '0.0462777165', '0.0326663657', '0.0202046859', '0.0140392334', '0.0097374884', '0.0087708344', '0.0070173416', '0.0059077793', '0.0059055884', '0.0055112422', '0.0041465344', '0.0037157869', '0.0034255885', '0.0028662571', '0.0027776135', '0.0026823974', '0.0026594781', '0.0026358847', '0.0026264474', '0.0024850557', '0.0024808426', '0.0024194999', '0.0020327371', '0.0018436532', '0.0017427072', '0.0017007448', '0.0016458059', '0.0013303289', '0.0012908943', '0.0012221364', '0.001211688', '0.0012015765', '0.0011909595', '0.0011650068', '0.001138717', '0.0011385485', '0.0011275944', '0.0011195053']] +[default0]: universal_checkpoint ............................ False +[default0]: use_bnb_optimizer ............................... False +[default0]: use_checkpoint_lr_scheduler ..................... False +[default0]: use_contiguous_buffers_in_ddp ................... False +[default0]: use_cpu_initialization .......................... None +[default0]: use_one_sent_docs ............................... False +[default0]: use_pin_memory .................................. False +[default0]: valid_num_workers ............................... 2 +[default0]: valid_weighted_split_names ...................... ['validation_pretraining', 'validation'] +[default0]: valid_weighted_split_paths ...................... [['/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/ar/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_ar_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/ca/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_ca_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/code/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_code_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/en/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_en_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/es/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_es_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/eu/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_eu_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/fr/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_fr_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/id/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_id_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-as/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-as_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-bn/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-bn_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-gu/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-gu_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-hi/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-hi_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-kn/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-kn_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-ml/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-ml_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-mr/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-mr_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-ne/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-ne_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-or/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-or_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-pa/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-pa_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-ta/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-ta_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-te/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-te_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-ur/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-ur_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/nigercongo-all/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_nigercongo-all_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/oscar-en/meg_ds_bigscience_tokenizer_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/oscar-zh/meg_ds_bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/pt/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_pt_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/vi/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_vi_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/zhs/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_zhs_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/zht/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_zht_text_document'], ['/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_en', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_es', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_pt', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_code', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_fr', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ar', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_id', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_zh', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_hi', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_vi', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ur', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_te', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ta', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_bn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_mr', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_sw', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_gu', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_pa', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ne', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_yo', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ig', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ny', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_zu', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_xh', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_sn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ts', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_rw', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_lg', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_tn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_nso', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_rn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ml', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_kn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_or', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_as', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ln', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_wo', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_tum', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ki', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_st', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_fon', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ca', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_eu', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ak', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_bm', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_tw']] +[default0]: valid_weighted_split_paths_path ................. None +[default0]: valid_weighted_split_splits ..................... [['0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0'], ['0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1']] +[default0]: valid_weighted_split_weights .................... [['0.0330676168743166', '0.011242051312222764', '0.13027200903379185', '0.22171164529099704', '0.10667815627928671', '0.0015595123898173287', '0.13054018439603915', '0.01091803753667153', '0.00011021422347108609', '0.005492381453597748', '0.0004021215011318779', '0.007470068593492175', '0.0006190467776576425', '0.0010335296343329384', '0.0005012010684646179', '0.0006672772956128299', '0.00035928138344705506', '0.0005084433130291778', '0.0021137328219915496', '0.0009129946225980253', '0.0012454301613725426', '0.00031588689199263235', '0.08137213783015229', '0.055293935695898196', '0.04954150576361177', '0.02461641286531197', '0.12091748245519074', '0.0005177025345001541'], ['0.3924620202', '0.0797519865', '0.0645613968', '0.0592991658', '0.0584218969', '0.0492644683', '0.0485168956', '0.048344327', '0.0462777165', '0.0326663657', '0.0202046859', '0.0140392334', '0.0097374884', '0.0087708344', '0.0070173416', '0.0059077793', '0.0059055884', '0.0055112422', '0.0041465344', '0.0037157869', '0.0034255885', '0.0028662571', '0.0027776135', '0.0026823974', '0.0026594781', '0.0026358847', '0.0026264474', '0.0024850557', '0.0024808426', '0.0024194999', '0.0020327371', '0.0018436532', '0.0017427072', '0.0017007448', '0.0016458059', '0.0013303289', '0.0012908943', '0.0012221364', '0.001211688', '0.0012015765', '0.0011909595', '0.0011650068', '0.001138717', '0.0011385485', '0.0011275944', '0.0011195053']] +[default0]: virtual_pipeline_model_parallel_size ............ None +[default0]: vocab_extra_ids ................................. 0 +[default0]: vocab_file ...................................... None +[default0]: weight_decay .................................... 0.0001 +[default0]: world_size ...................................... 64 +[default0]: zero_allgather_bucket_size ...................... 0.0 +[default0]: zero_contigious_gradients ....................... False +[default0]: zero_reduce_bucket_size ......................... 0.0 +[default0]: zero_reduce_scatter ............................. False +[default0]: zero_stage ...................................... 0 +[default0]:-------------------- end of arguments --------------------- +[default0]:setting number of micro-batches to constant 128 +[default0]:> building PretrainedFromHF tokenizer ... +[default0]: vocab file is un-used. loading tokenizer from pre-trained model +[default0]:Offline mode: forcing local_files_only=True +[default0]:Offline mode: forcing local_files_only=True +[default0]:loading file tokenizer.json from cache at /gpfswork/rech/six/commun/models/models--bigscience--tokenizer/snapshots/d56c744c331ce0ffa516ed084785eb22da74b91e/tokenizer.json +[default0]:loading file added_tokens.json from cache at None +[default0]:loading file special_tokens_map.json from cache at /gpfswork/rech/six/commun/models/models--bigscience--tokenizer/snapshots/d56c744c331ce0ffa516ed084785eb22da74b91e/special_tokens_map.json +[default0]:loading file tokenizer_config.json from cache at /gpfswork/rech/six/commun/models/models--bigscience--tokenizer/snapshots/d56c744c331ce0ffa516ed084785eb22da74b91e/tokenizer_config.json +[default0]: > padded vocab (size: 250680) with 200 dummy tokens (new size: 250880) +[default7]:> setting tensorboard ... +[default0]:DeepSpeed general environment info: +[default0]:torch install path ............... ['/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch'] +[default0]:torch version .................... 1.12.0 +[default0]:torch cuda version ............... 11.3 +[default0]:torch hip version ................ None +[default0]:nvcc version ..................... 11.4 +[default0]:deepspeed install path ........... ['/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed'] +[default0]:deepspeed info ................... 0.7.1+8b2a6371, 8b2a6371, master +[default0]:deepspeed wheel compiled w. ...... torch 1.12, cuda 11.3 +[default0]:**** Git info for Megatron: git_hash=6c1018f git_branch=mtf-multival **** +[default0]:> initializing torch distributed ... +[default0]:[2022-10-06 13:16:28,644] [INFO] [comm.py:628:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl +[default0]:> initializing tensor model parallel with size 2 +[default0]:> initializing pipeline model parallel with size 2 +[default0]:> setting random seeds to 42 ... +[default0]:[2022-10-06 13:16:30,727] [INFO] [checkpointing.py:226:model_parallel_cuda_manual_seed] > initializing model parallel cuda seeds on global rank 0, model parallel rank 0, and data parallel rank 0 with model parallel seed: 2760 and data parallel seed: 42 +[default0]:> compiling dataset index builder ... +[default0]:make: Entering directory '/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/data' +[default0]:make: Nothing to be done for 'default'. +[default0]:make: Leaving directory '/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/data' +[default0]:>>> done with dataset index builder. Compilation time: 0.093 seconds +[default0]:> compiling and loading fused kernels ... +[default0]:Detected CUDA files, patching ldflags +[default0]:Emitting ninja build file /gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/fused_kernels/build/build.ninja... +[default0]:Building extension module scaled_upper_triang_masked_softmax_cuda... +[default0]:Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) +[default0]:ninja: no work to do. +[default0]:Loading extension module scaled_upper_triang_masked_softmax_cuda... +[default0]:Detected CUDA files, patching ldflags +[default0]:Emitting ninja build file /gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/fused_kernels/build/build.ninja... +[default0]:Building extension module scaled_masked_softmax_cuda... +[default0]:Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) +[default0]:ninja: no work to do. +[default0]:Loading extension module scaled_masked_softmax_cuda... +[default0]:Detected CUDA files, patching ldflags +[default0]:Emitting ninja build file /gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/fused_kernels/build/build.ninja... +[default0]:Building extension module fused_mix_prec_layer_norm_cuda... +[default0]:Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) +[default0]:ninja: no work to do. +[default0]:Loading extension module fused_mix_prec_layer_norm_cuda... +[default0]:>>> done with compiling and loading fused kernels. Compilation time: 6.952 seconds +[default0]:time to initialize megatron (seconds): 21.774 +[default0]:[after megatron is initialized] datetime: 2022-10-06 13:16:37 +[default0]:building GPT model ... +[default0]:[2022-10-06 13:16:37,823] [INFO] [utils.py:827:see_memory_usage] Before Building Model +[default0]:[2022-10-06 13:16:37,824] [INFO] [utils.py:828:see_memory_usage] MA 0.0 GB Max_MA 0.0 GB CA 0.0 GB Max_CA 0 GB +[default0]:[2022-10-06 13:16:37,824] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 33.27 GB, percent = 6.6% +[default0]:SEED_LAYERS=False BASE_SEED=1234 SEED_FN=None +[default0]:Using topology: {ProcessCoord(pipe=0, data=0, model=0): 0, ProcessCoord(pipe=0, data=0, model=1): 1, ProcessCoord(pipe=0, data=1, model=0): 2, ProcessCoord(pipe=0, data=1, model=1): 3, ProcessCoord(pipe=0, data=2, model=0): 4, ProcessCoord(pipe=0, data=2, model=1): 5, ProcessCoord(pipe=0, data=3, model=0): 6, ProcessCoord(pipe=0, data=3, model=1): 7, ProcessCoord(pipe=0, data=4, model=0): 8, ProcessCoord(pipe=0, data=4, model=1): 9, ProcessCoord(pipe=0, data=5, model=0): 10, ProcessCoord(pipe=0, data=5, model=1): 11, ProcessCoord(pipe=0, data=6, model=0): 12, ProcessCoord(pipe=0, data=6, model=1): 13, ProcessCoord(pipe=0, data=7, model=0): 14, ProcessCoord(pipe=0, data=7, model=1): 15, ProcessCoord(pipe=0, data=8, model=0): 16, ProcessCoord(pipe=0, data=8, model=1): 17, ProcessCoord(pipe=0, data=9, model=0): 18, ProcessCoord(pipe=0, data=9, model=1): 19, ProcessCoord(pipe=0, data=10, model=0): 20, ProcessCoord(pipe=0, data=10, model=1): 21, ProcessCoord(pipe=0, data=11, model=0): 22, ProcessCoord(pipe=0, data=11, model=1): 23, ProcessCoord(pipe=0, data=12, model=0): 24, ProcessCoord(pipe=0, data=12, model=1): 25, ProcessCoord(pipe=0, data=13, model=0): 26, ProcessCoord(pipe=0, data=13, model=1): 27, ProcessCoord(pipe=0, data=14, model=0): 28, ProcessCoord(pipe=0, data=14, model=1): 29, ProcessCoord(pipe=0, data=15, model=0): 30, ProcessCoord(pipe=0, data=15, model=1): 31, ProcessCoord(pipe=1, data=0, model=0): 32, ProcessCoord(pipe=1, data=0, model=1): 33, ProcessCoord(pipe=1, data=1, model=0): 34, ProcessCoord(pipe=1, data=1, model=1): 35, ProcessCoord(pipe=1, data=2, model=0): 36, ProcessCoord(pipe=1, data=2, model=1): 37, ProcessCoord(pipe=1, data=3, model=0): 38, ProcessCoord(pipe=1, data=3, model=1): 39, ProcessCoord(pipe=1, data=4, model=0): 40, ProcessCoord(pipe=1, data=4, model=1): 41, ProcessCoord(pipe=1, data=5, model=0): 42, ProcessCoord(pipe=1, data=5, model=1): 43, ProcessCoord(pipe=1, data=6, model=0): 44, ProcessCoord(pipe=1, data=6, model=1): 45, ProcessCoord(pipe=1, data=7, model=0): 46, ProcessCoord(pipe=1, data=7, model=1): 47, ProcessCoord(pipe=1, data=8, model=0): 48, ProcessCoord(pipe=1, data=8, model=1): 49, ProcessCoord(pipe=1, data=9, model=0): 50, ProcessCoord(pipe=1, data=9, model=1): 51, ProcessCoord(pipe=1, data=10, model=0): 52, ProcessCoord(pipe=1, data=10, model=1): 53, ProcessCoord(pipe=1, data=11, model=0): 54, ProcessCoord(pipe=1, data=11, model=1): 55, ProcessCoord(pipe=1, data=12, model=0): 56, ProcessCoord(pipe=1, data=12, model=1): 57, ProcessCoord(pipe=1, data=13, model=0): 58, ProcessCoord(pipe=1, data=13, model=1): 59, ProcessCoord(pipe=1, data=14, model=0): 60, ProcessCoord(pipe=1, data=14, model=1): 61, ProcessCoord(pipe=1, data=15, model=0): 62, ProcessCoord(pipe=1, data=15, model=1): 63} +[default0]:[2022-10-06 13:16:38,681] [INFO] [module.py:366:_partition_layers] Partitioning pipeline stages with method type:transformer|embedding +[default0]:stage=0 layers=15 +[default0]: 0: _to_float16 +[default0]: 1: EmbeddingPipe +[default0]: 2: +[default0]: 3: ParallelTransformerLayerPipe +[default0]: 4: ParallelTransformerLayerPipe +[default0]: 5: ParallelTransformerLayerPipe +[default0]: 6: ParallelTransformerLayerPipe +[default0]: 7: ParallelTransformerLayerPipe +[default0]: 8: ParallelTransformerLayerPipe +[default0]: 9: ParallelTransformerLayerPipe +[default0]: 10: ParallelTransformerLayerPipe +[default0]: 11: ParallelTransformerLayerPipe +[default0]: 12: ParallelTransformerLayerPipe +[default0]: 13: ParallelTransformerLayerPipe +[default0]: 14: ParallelTransformerLayerPipe +[default0]:stage=1 layers=16 +[default0]: 15: ParallelTransformerLayerPipe +[default0]: 16: ParallelTransformerLayerPipe +[default0]: 17: ParallelTransformerLayerPipe +[default0]: 18: ParallelTransformerLayerPipe +[default0]: 19: ParallelTransformerLayerPipe +[default0]: 20: ParallelTransformerLayerPipe +[default0]: 21: ParallelTransformerLayerPipe +[default0]: 22: ParallelTransformerLayerPipe +[default0]: 23: ParallelTransformerLayerPipe +[default0]: 24: ParallelTransformerLayerPipe +[default0]: 25: ParallelTransformerLayerPipe +[default0]: 26: ParallelTransformerLayerPipe +[default0]: 27: undo +[default0]: 28: MixedFusedLayerNorm +[default0]: 29: EmbeddingPipe +[default0]: 30: float16_to_fp32 +[default0]: loss: CrossEntropy +[default0]:[2022-10-06 13:16:39,411] [INFO] [utils.py:827:see_memory_usage] After Building Model +[default0]:[2022-10-06 13:16:39,411] [INFO] [utils.py:828:see_memory_usage] MA 0.68 GB Max_MA 0.68 GB CA 0.71 GB Max_CA 1 GB +[default0]:[2022-10-06 13:16:39,411] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 33.69 GB, percent = 6.7% +[default0]:setting training iterations to 3100 +[default0]:> learning rate decay style: constant +[default0]:DeepSpeed is enabled. +[default0]:[2022-10-06 13:16:39,413] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed info: version=0.7.1+8b2a6371, git-hash=8b2a6371, git-branch=master +[default0]:[2022-10-06 13:16:40,514] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed Flops Profiler Enabled: False +[default0]:[2022-10-06 13:16:40,514] [INFO] [logging.py:68:log_dist] [Rank 0] Removing param_group that has no 'params' in the client Optimizer +[default0]:[2022-10-06 13:16:40,514] [INFO] [logging.py:68:log_dist] [Rank 0] Using client Optimizer as basic optimizer +[default0]:[2022-10-06 13:16:40,517] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed Basic Optimizer = {basic_optimizer.__class__.__name__} +[default0]:[2022-10-06 13:16:40,517] [INFO] [logging.py:68:log_dist] [Rank 0] Creating fp16 optimizer with dynamic loss scale +[default2]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default4]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default6]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default0]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default5]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default1]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default3]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default7]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default1]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default7]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default3]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default4]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default1]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default3]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default0]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default2]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default7]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default5]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default2]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default7]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default3]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default1]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default6]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default4]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default5]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default5]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default7]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default3]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default0]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default6]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default4]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default2]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default1]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default0]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default4]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default5]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default6]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default2]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default7]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default3]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default0]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default1]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default1]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default7]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default4]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default3]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default5]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default6]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default0]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default2]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default5]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default7]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default3]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default1]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default2]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default6]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default4]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default0]:[2022-10-06 13:16:40,552] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed Final Optimizer = FusedAdam +[default0]:[2022-10-06 13:16:40,553] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed using client LR scheduler +[default0]:[2022-10-06 13:16:40,553] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed LR Scheduler = +[default0]:[2022-10-06 13:16:40,553] [INFO] [logging.py:68:log_dist] [Rank 0] step=0, skipped=0, lr=[2e-05, 2e-05, 2e-05], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)] +[default0]:[2022-10-06 13:16:40,553] [INFO] [config.py:987:print] DeepSpeedEngine configuration: +[default0]:[2022-10-06 13:16:40,553] [INFO] [config.py:991:print] activation_checkpointing_config { +[default0]: "partition_activations": false, +[default0]: "contiguous_memory_optimization": false, +[default0]: "cpu_checkpointing": false, +[default0]: "number_checkpoints": null, +[default6]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default0]: "synchronize_checkpoint_boundary": false, +[default0]: "profile": false +[default0]:} +[default0]:[2022-10-06 13:16:40,553] [INFO] [config.py:991:print] aio_config ................... {'block_size': 1048576, 'queue_depth': 8, 'thread_count': 1, 'single_submit': False, 'overlap_events': True} +[default0]:[2022-10-06 13:16:40,553] [INFO] [config.py:991:print] amp_enabled .................. False +[default0]:[2022-10-06 13:16:40,553] [INFO] [config.py:991:print] amp_params ................... False +[default0]:[2022-10-06 13:16:40,553] [INFO] [config.py:991:print] autotuning_config ............ { +[default0]: "enabled": false, +[default0]: "start_step": null, +[default0]: "end_step": null, +[default0]: "metric_path": null, +[default0]: "arg_mappings": null, +[default0]: "metric": "throughput", +[default0]: "model_info": null, +[default0]: "results_dir": null, +[default0]: "exps_dir": null, +[default0]: "overwrite": true, +[default0]: "fast": true, +[default0]: "start_profile_step": 3, +[default0]: "end_profile_step": 5, +[default0]: "tuner_type": "gridsearch", +[default0]: "tuner_early_stopping": 5, +[default0]: "tuner_num_trials": 50, +[default0]: "model_info_path": null, +[default0]: "mp_size": 1, +[default0]: "max_train_batch_size": null, +[default0]: "min_train_batch_size": 1, +[default0]: "max_train_micro_batch_size_per_gpu": 1.024000e+03, +[default0]: "min_train_micro_batch_size_per_gpu": 1, +[default0]: "num_tuning_micro_batch_sizes": 3 +[default0]:} +[default0]:[2022-10-06 13:16:40,553] [INFO] [config.py:991:print] bfloat16_enabled ............. False +[default0]:[2022-10-06 13:16:40,553] [INFO] [config.py:991:print] checkpoint_tag_validation_enabled True +[default0]:[2022-10-06 13:16:40,553] [INFO] [config.py:991:print] checkpoint_tag_validation_fail False +[default0]:[2022-10-06 13:16:40,553] [INFO] [config.py:991:print] comms_config ................. +[default0]:[2022-10-06 13:16:40,554] [INFO] [config.py:991:print] communication_data_type ...... None +[default0]:[2022-10-06 13:16:40,554] [INFO] [config.py:991:print] compression_config ........... {'weight_quantization': {'shared_parameters': {'enabled': False, 'quantizer_kernel': False, 'schedule_offset': 0, 'quantize_groups': 1, 'quantize_verbose': False, 'quantization_type': 'symmetric', 'quantize_weight_in_forward': False, 'rounding': 'nearest', 'fp16_mixed_quantize': False, 'quantize_change_ratio': 0.001}, 'different_groups': {}}, 'activation_quantization': {'shared_parameters': {'enabled': False, 'quantization_type': 'symmetric', 'range_calibration': 'dynamic', 'schedule_offset': 1000}, 'different_groups': {}}, 'sparse_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'row_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'head_pruning': {'shared_parameters': {'enabled': False, 'method': 'topk', 'schedule_offset': 1000}, 'different_groups': {}}, 'channel_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'layer_reduction': {'enabled': False}} +[default0]:[2022-10-06 13:16:40,554] [INFO] [config.py:991:print] curriculum_enabled ........... False +[default0]:[2022-10-06 13:16:40,554] [INFO] [config.py:991:print] curriculum_params ............ False +[default0]:[2022-10-06 13:16:40,554] [INFO] [config.py:991:print] dataloader_drop_last ......... False +[default0]:[2022-10-06 13:16:40,554] [INFO] [config.py:991:print] disable_allgather ............ False +[default0]:[2022-10-06 13:16:40,554] [INFO] [config.py:991:print] dump_state ................... False +[default0]:[2022-10-06 13:16:40,554] [INFO] [config.py:991:print] dynamic_loss_scale_args ...... {'init_scale': 4096, 'scale_window': 500, 'delayed_shift': 2, 'min_scale': 1} +[default0]:[2022-10-06 13:16:40,554] [INFO] [config.py:991:print] eigenvalue_enabled ........... False +[default0]:[2022-10-06 13:16:40,554] [INFO] [config.py:991:print] eigenvalue_gas_boundary_resolution 1 +[default0]:[2022-10-06 13:16:40,554] [INFO] [config.py:991:print] eigenvalue_layer_name ........ bert.encoder.layer +[default0]:[2022-10-06 13:16:40,554] [INFO] [config.py:991:print] eigenvalue_layer_num ......... 0 +[default0]:[2022-10-06 13:16:40,554] [INFO] [config.py:991:print] eigenvalue_max_iter .......... 100 +[default0]:[2022-10-06 13:16:40,554] [INFO] [config.py:991:print] eigenvalue_stability ......... 1e-06 +[default0]:[2022-10-06 13:16:40,554] [INFO] [config.py:991:print] eigenvalue_tol ............... 0.01 +[default0]:[2022-10-06 13:16:40,554] [INFO] [config.py:991:print] eigenvalue_verbose ........... False +[default0]:[2022-10-06 13:16:40,554] [INFO] [config.py:991:print] elasticity_enabled ........... False +[default0]:[2022-10-06 13:16:40,554] [INFO] [config.py:991:print] flops_profiler_config ........ { +[default0]: "enabled": false, +[default0]: "profile_step": 1, +[default0]: "module_depth": -1, +[default0]: "top_modules": 1, +[default0]: "detailed": true, +[default0]: "output_file": null +[default0]:} +[default0]:[2022-10-06 13:16:40,554] [INFO] [config.py:991:print] fp16_auto_cast ............... False +[default0]:[2022-10-06 13:16:40,554] [INFO] [config.py:991:print] fp16_enabled ................. True +[default0]:[2022-10-06 13:16:40,554] [INFO] [config.py:991:print] fp16_master_weights_and_gradients False +[default0]:[2022-10-06 13:16:40,554] [INFO] [config.py:991:print] global_rank .................. 0 +[default0]:[2022-10-06 13:16:40,554] [INFO] [config.py:991:print] gradient_accumulation_steps .. 128 +[default0]:[2022-10-06 13:16:40,554] [INFO] [config.py:991:print] gradient_clipping ............ 1.0 +[default0]:[2022-10-06 13:16:40,554] [INFO] [config.py:991:print] gradient_predivide_factor .... 1.0 +[default0]:[2022-10-06 13:16:40,554] [INFO] [config.py:991:print] initial_dynamic_scale ........ 4096 +[default0]:[2022-10-06 13:16:40,554] [INFO] [config.py:991:print] load_universal_checkpoint .... False +[default0]:[2022-10-06 13:16:40,554] [INFO] [config.py:991:print] loss_scale ................... 0 +[default0]:[2022-10-06 13:16:40,554] [INFO] [config.py:991:print] memory_breakdown ............. False +[default0]:[2022-10-06 13:16:40,554] [INFO] [config.py:991:print] monitor_config ............... +[default0]:[2022-10-06 13:16:40,554] [INFO] [config.py:991:print] nebula_config ................ { +[default0]: "enabled": false, +[default0]: "persistent_storage_path": null, +[default0]: "persistent_time_interval": 100, +[default0]: "num_of_version_in_retention": 2, +[default0]: "enable_nebula_load": true, +[default0]: "load_path": null +[default0]:} +[default0]:[2022-10-06 13:16:40,554] [INFO] [config.py:991:print] optimizer_legacy_fusion ...... False +[default0]:[2022-10-06 13:16:40,554] [INFO] [config.py:991:print] optimizer_name ............... None +[default0]:[2022-10-06 13:16:40,554] [INFO] [config.py:991:print] optimizer_params ............. None +[default0]:[2022-10-06 13:16:40,554] [INFO] [config.py:991:print] pipeline ..................... {'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0} +[default0]:[2022-10-06 13:16:40,554] [INFO] [config.py:991:print] pld_enabled .................. False +[default0]:[2022-10-06 13:16:40,554] [INFO] [config.py:991:print] pld_params ................... False +[default0]:[2022-10-06 13:16:40,554] [INFO] [config.py:991:print] prescale_gradients ........... False +[default0]:[2022-10-06 13:16:40,554] [INFO] [config.py:991:print] scheduler_name ............... None +[default0]:[2022-10-06 13:16:40,554] [INFO] [config.py:991:print] scheduler_params ............. None +[default0]:[2022-10-06 13:16:40,554] [INFO] [config.py:991:print] sparse_attention ............. None +[default0]:[2022-10-06 13:16:40,554] [INFO] [config.py:991:print] sparse_gradients_enabled ..... False +[default0]:[2022-10-06 13:16:40,554] [INFO] [config.py:991:print] steps_per_print .............. 2000 +[default0]:[2022-10-06 13:16:40,554] [INFO] [config.py:991:print] train_batch_size ............. 2048 +[default0]:[2022-10-06 13:16:40,554] [INFO] [config.py:991:print] train_micro_batch_size_per_gpu 1 +[default0]:[2022-10-06 13:16:40,554] [INFO] [config.py:991:print] wall_clock_breakdown ......... False +[default0]:[2022-10-06 13:16:40,554] [INFO] [config.py:991:print] world_size ................... 16 +[default0]:[2022-10-06 13:16:40,554] [INFO] [config.py:991:print] zero_allow_untested_optimizer False +[default0]:[2022-10-06 13:16:40,555] [INFO] [config.py:991:print] zero_config .................. stage=0 contiguous_gradients=True reduce_scatter=True reduce_bucket_size=500000000 allgather_partitions=True allgather_bucket_size=500000000 overlap_comm=False load_from_fp32_weights=True elastic_checkpoint=False offload_param=None offload_optimizer=None sub_group_size=1000000000 cpu_offload_param=None cpu_offload_use_pin_memory=None cpu_offload=None prefetch_bucket_size=50000000 param_persistence_threshold=100000 model_persistence_threshold=9223372036854775807 max_live_parameters=1000000000 max_reuse_distance=1000000000 gather_16bit_weights_on_model_save=False stage3_gather_fp16_weights_on_model_save=False ignore_unused_parameters=True legacy_stage1=False round_robin_gradients=False +[default0]:[2022-10-06 13:16:40,555] [INFO] [config.py:991:print] zero_enabled ................. False +[default0]:[2022-10-06 13:16:40,555] [INFO] [config.py:991:print] zero_optimization_stage ...... 0 +[default0]:[2022-10-06 13:16:40,555] [INFO] [config.py:976:print_user_config] json = { +[default0]: "train_micro_batch_size_per_gpu": 1, +[default0]: "train_batch_size": 2.048000e+03, +[default0]: "gradient_clipping": 1.0, +[default0]: "zero_optimization": { +[default0]: "stage": 0 +[default0]: }, +[default0]: "fp16": { +[default0]: "enabled": true, +[default0]: "loss_scale": 0, +[default0]: "loss_scale_window": 500, +[default0]: "hysteresis": 2, +[default0]: "min_loss_scale": 1, +[default0]: "initial_scale_power": 12 +[default0]: }, +[default0]: "steps_per_print": 2.000000e+03, +[default0]: "wall_clock_breakdown": false +[default0]:} +[default0]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default2]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default5]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default0]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default6]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default4]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default4]:Emitting ninja build file /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113/utils/build.ninja... +[default4]:Building extension module utils... +[default4]:Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) +[default1]:Loading extension module utils... +[default1]:Time to load utils op: 0.213667631149292 seconds +[default7]:Loading extension module utils... +[default7]:Time to load utils op: 0.21291184425354004 seconds +[default3]:Loading extension module utils... +[default3]:Time to load utils op: 0.21350955963134766 seconds +[default4]:ninja: no work to do. +[default4]:Loading extension module utils... +[default4]:Time to load utils op: 0.1994309425354004 seconds +[default5]:Loading extension module utils... +[default5]:Time to load utils op: 0.2129220962524414 seconds +[default6]:Loading extension module utils... +[default6]:Time to load utils op: 0.21739649772644043 seconds +[default2]:Loading extension module utils... +[default2]:Time to load utils op: 0.2171945571899414 seconds +[default0]:Loading extension module utils... +[default0]:Time to load utils op: 0.2172536849975586 seconds +[default2]:Loading extension module utils... +[default2]:Time to load utils op: 0.2952873706817627 seconds +[default4]:Loading extension module utils... +[default4]:Time to load utils op: 0.295673131942749 seconds +[default0]:Loading extension module utils... +[default0]:Time to load utils op: 0.29544901847839355 seconds +[default5]:Loading extension module utils... +[default5]:Time to load utils op: 0.2623584270477295 seconds +[default6]:Loading extension module utils... +[default6]:Time to load utils op: 0.29592251777648926 seconds +[default1]:Loading extension module utils... +[default1]:Time to load utils op: 0.26558566093444824 seconds +[default7]:Loading extension module utils... +[default7]:Time to load utils op: 0.26117634773254395 seconds +[default3]:Loading extension module utils... +[default3]:Time to load utils op: 0.26955747604370117 seconds +[default1]:Loading extension module utils... +[default1]:Time to load utils op: 0.26685595512390137 seconds +[default2]:Loading extension module utils... +[default2]:Time to load utils op: 0.2591888904571533 seconds +[default5]:Loading extension module utils... +[default5]:Time to load utils op: 0.2581915855407715 seconds +[default0]:Loading extension module utils... +[default0]:Time to load utils op: 0.2590303421020508 seconds +[default6]:Loading extension module utils... +[default6]:Time to load utils op: 0.25885677337646484 seconds +[default4]:Loading extension module utils... +[default7]:Loading extension module utils... +[default7]:Time to load utils op: 0.2679152488708496 seconds +[default4]:Time to load utils op: 0.2596418857574463 seconds +[default3]:Loading extension module utils... +[default3]:Time to load utils op: 0.2670021057128906 seconds +[default4]:Loading extension module utils... +[default4]:Time to load utils op: 0.267197847366333 seconds +[default3]:Loading extension module utils... +[default1]:Loading extension module utils... +[default1]:Time to load utils op: 0.27347588539123535 seconds +[default3]:Time to load utils op: 0.27970075607299805 seconds +[default0]:Loading extension module utils... +[default0]:Time to load utils op: 0.2737283706665039 seconds +[default2]:Loading extension module utils... +[default2]:Time to load utils op: 0.2662050724029541 seconds +[default7]:Loading extension module utils... +[default7]:Time to load utils op: 0.2798807621002197 seconds +[default5]:Loading extension module utils... +[default5]:Time to load utils op: 0.27645206451416016 seconds +[default2]:Loading extension module utils... +[default2]:Time to load utils op: 0.27158570289611816 seconds +[default1]:Loading extension module utils... +[default1]:Time to load utils op: 0.2773759365081787 seconds +[default3]:Loading extension module utils... +[default3]:Time to load utils op: 0.2687408924102783 seconds +[default6]:Loading extension module utils... +[default6]:Time to load utils op: 0.268934965133667 seconds +[default7]:Loading extension module utils... +[default5]:Loading extension module utils... +[default5]:Time to load utils op: 0.280911922454834 seconds +[default7]:Time to load utils op: 0.2773780822753906 seconds +[default4]:Loading extension module utils... +[default4]:Time to load utils op: 0.271193265914917 seconds +[default5]:Loading extension module utils... +[default5]:Time to load utils op: 0.2773904800415039 seconds +[default7]:Loading extension module utils... +[default7]:Time to load utils op: 0.282397985458374 seconds +[default6]:Loading extension module utils... +[default6]:Time to load utils op: 0.321852445602417 seconds +[default3]:Loading extension module utils... +[default3]:Time to load utils op: 0.2883000373840332 seconds +[default0]:Loading extension module utils... +[default0]:Time to load utils op: 0.3212733268737793 seconds +[default4]:Loading extension module utils... +[default4]:Time to load utils op: 0.3215298652648926 seconds +[default2]:Loading extension module utils... +[default2]:Time to load utils op: 0.32164692878723145 seconds +[default1]:Loading extension module utils... +[default1]:Time to load utils op: 0.2856471538543701 seconds +[default0]:Loading extension module utils... +[default0]:Time to load utils op: 0.2765936851501465 seconds +[default4]:Loading extension module utils... +[default4]:Time to load utils op: 0.25793910026550293 seconds +[default5]:Loading extension module utils... +[default5]:Time to load utils op: 0.2671937942504883 seconds +[default6]:Loading extension module utils... +[default6]:Time to load utils op: 0.2578446865081787 seconds +[default2]:Loading extension module utils... +[default2]:Time to load utils op: 0.2578907012939453 seconds +[default0]:Loading extension module utils... +[default7]:Loading extension module utils... +[default7]:Time to load utils op: 0.2582111358642578 seconds +[default0]:Time to load utils op: 0.25795698165893555 seconds +[default3]:Loading extension module utils... +[default3]:Time to load utils op: 0.25823116302490234 seconds +[default1]:Loading extension module utils... +[default1]:Time to load utils op: 0.2671666145324707 seconds +[default5]:Loading extension module utils... +[default5]:Time to load utils op: 0.289470911026001 seconds +[default7]:Loading extension module utils... +[default7]:Time to load utils op: 0.2903010845184326 seconds +[default3]:Loading extension module utils... +[default3]:Time to load utils op: 0.2846372127532959 seconds +[default2]:Loading extension module utils... +[default2]:Time to load utils op: 0.33310532569885254 seconds +[default1]:Loading extension module utils... +[default1]:Time to load utils op: 0.289839506149292 seconds +[default6]:Loading extension module utils... +[default6]:Time to load utils op: 0.26601409912109375 seconds +[default4]:Loading extension module utils... +[default4]:Time to load utils op: 0.32926177978515625 seconds +[default6]:Loading extension module utils... +[default6]:Time to load utils op: 0.3299705982208252 seconds +[default0]:Loading extension module utils... +[default0]:Time to load utils op: 0.333920955657959 seconds +[default0]:[2022-10-06 13:16:40,890] [INFO] [engine.py:87:__init__] CONFIG: micro_batches=128 micro_batch_size=1 +[default1]:[2022-10-06 13:16:41,352] [INFO] [engine.py:145:__init__] RANK=33 STAGE=1 LAYERS=16 [15, 31) STAGE_PARAMS=362723328 (362.723M) TOTAL_PARAMS=1450893312 (1450.893M) UNIQUE_PARAMS=1065541632 (1065.542M) +[default0]:[2022-10-06 13:16:41,351] [INFO] [engine.py:145:__init__] RANK=32 STAGE=1 LAYERS=16 [15, 31) STAGE_PARAMS=362723328 (362.723M) TOTAL_PARAMS=1450893312 (1450.893M) UNIQUE_PARAMS=1065541632 (1065.542M) +[default1]:[2022-10-06 13:16:41,352] [INFO] [engine.py:145:__init__] RANK=1 STAGE=0 LAYERS=15 [0, 15) STAGE_PARAMS=362723328 (362.723M) TOTAL_PARAMS=1450893312 (1450.893M) UNIQUE_PARAMS=1065541632 (1065.542M) +[default0]:[2022-10-06 13:16:41,351] [INFO] [engine.py:145:__init__] RANK=0 STAGE=0 LAYERS=15 [0, 15) STAGE_PARAMS=362723328 (362.723M) TOTAL_PARAMS=1450893312 (1450.893M) UNIQUE_PARAMS=1065541632 (1065.542M) +[default1]:[2022-10-06 13:16:41,642] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default7]:[2022-10-06 13:16:41,642] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default3]:[2022-10-06 13:16:41,642] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default2]:[2022-10-06 13:16:41,642] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default6]:[2022-10-06 13:16:41,642] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default4]:[2022-10-06 13:16:41,642] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default5]:[2022-10-06 13:16:41,642] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default0]:[2022-10-06 13:16:41,641] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default2]:[2022-10-06 13:16:41,648] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default4]:[2022-10-06 13:16:41,648] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default5]:[2022-10-06 13:16:41,648] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default7]:[2022-10-06 13:16:41,648] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default1]:[2022-10-06 13:16:41,648] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default0]:[2022-10-06 13:16:41,648] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default6]:[2022-10-06 13:16:41,648] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default3]:[2022-10-06 13:16:41,648] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default1]:[2022-10-06 13:16:41,646] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default2]:[2022-10-06 13:16:41,646] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default5]:[2022-10-06 13:16:41,646] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default0]:[2022-10-06 13:16:41,646] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default6]:[2022-10-06 13:16:41,646] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default7]:[2022-10-06 13:16:41,646] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default4]:[2022-10-06 13:16:41,646] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default3]:[2022-10-06 13:16:41,646] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default3]:[2022-10-06 13:16:41,649] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default4]:[2022-10-06 13:16:41,650] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default1]:[2022-10-06 13:16:41,649] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default0]:[2022-10-06 13:16:41,649] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default7]:[2022-10-06 13:16:41,649] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default2]:[2022-10-06 13:16:41,650] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default5]:[2022-10-06 13:16:41,650] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default2]:[2022-10-06 13:16:41,648] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default1]:[2022-10-06 13:16:41,648] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default3]:[2022-10-06 13:16:41,648] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default7]:[2022-10-06 13:16:41,648] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default6]:[2022-10-06 13:16:41,648] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default5]:[2022-10-06 13:16:41,648] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default4]:[2022-10-06 13:16:41,648] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default5]:[2022-10-06 13:16:41,643] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default7]:[2022-10-06 13:16:41,643] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default3]:[2022-10-06 13:16:41,643] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default0]:[2022-10-06 13:16:41,643] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default6]:[2022-10-06 13:16:41,643] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default4]:[2022-10-06 13:16:41,643] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default1]:[2022-10-06 13:16:41,643] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default2]:[2022-10-06 13:16:41,643] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default0]:[2022-10-06 13:16:41,648] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default5]:[2022-10-06 13:16:41,649] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default2]:[2022-10-06 13:16:41,649] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default7]:[2022-10-06 13:16:41,649] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default6]:[2022-10-06 13:16:41,649] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default4]:[2022-10-06 13:16:41,649] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default1]:[2022-10-06 13:16:41,649] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default6]:[2022-10-06 13:16:41,649] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default3]:[2022-10-06 13:16:41,649] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default0]:[2022-10-06 13:16:41,649] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default4]:[2022-10-06 13:16:41,905] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default5]:[2022-10-06 13:16:41,905] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default1]:[2022-10-06 13:16:41,905] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default0]:[2022-10-06 13:16:41,905] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default3]:[2022-10-06 13:16:41,905] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default2]:[2022-10-06 13:16:41,905] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default6]:[2022-10-06 13:16:41,905] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default7]:[2022-10-06 13:16:41,905] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default1]:[2022-10-06 13:16:44,235] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default3]:[2022-10-06 13:16:44,215] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default2]:[2022-10-06 13:16:44,234] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default3]:Traceback (most recent call last): +[default3]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default3]: main() +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default3]: return f(*args, **kwargs) +[default3]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default3]: pretrain( +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default3]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer +[default3]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default3]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default3]: load_path, client_states = self._load_checkpoint(load_dir, +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint +[default3]: sd_loader = SDLoaderFactory.get_sd_loader( +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader +[default3]: return MegatronSDLoader(ckpt_list, version, checkpoint_engine) +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ +[default3]: super().__init__(ckpt_list, version, checkpoint_engine) +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ +[default3]: self.check_ckpt_list() +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list +[default3]: assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" +[default3]:AssertionError: checkpoint count 1 is different from saved mp_world_size 4 +[default0]:Traceback (most recent call last): +[default0]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default0]: main() +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default0]: return f(*args, **kwargs) +[default0]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default0]: pretrain( +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default0]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer +[default0]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default0]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default0]: load_path, client_states = self._load_checkpoint(load_dir, +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint +[default0]: sd_loader = SDLoaderFactory.get_sd_loader( +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader +[default0]: return MegatronSDLoader(ckpt_list, version, checkpoint_engine) +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ +[default0]: super().__init__(ckpt_list, version, checkpoint_engine) +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ +[default0]: self.check_ckpt_list() +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list +[default0]: assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" +[default0]:[2022-10-06 13:16:44,217] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default0]:AssertionError: checkpoint count 1 is different from saved mp_world_size 4 +[default2]:Traceback (most recent call last): +[default2]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default2]: main() +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default2]: return f(*args, **kwargs) +[default1]:Traceback (most recent call last): +[default1]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default1]: main() +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default2]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default2]: pretrain( +[default1]: return f(*args, **kwargs) +[default1]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default1]: pretrain( +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default2]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default1]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer +[default2]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer +[default1]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default2]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default2]: load_path, client_states = self._load_checkpoint(load_dir, +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default1]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default1]: load_path, client_states = self._load_checkpoint(load_dir, +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint +[default1]: sd_loader = SDLoaderFactory.get_sd_loader( +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint +[default2]: sd_loader = SDLoaderFactory.get_sd_loader( +[default1]: return MegatronSDLoader(ckpt_list, version, checkpoint_engine) +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ +[default1]: super().__init__(ckpt_list, version, checkpoint_engine) +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader +[default2]: return MegatronSDLoader(ckpt_list, version, checkpoint_engine) +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ +[default1]: self.check_ckpt_list() +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list +[default1]: assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" +[default1]:AssertionError: checkpoint count 1 is different from saved mp_world_size 4 +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ +[default2]: super().__init__(ckpt_list, version, checkpoint_engine) +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ +[default2]: self.check_ckpt_list() +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list +[default2]: assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" +[default2]:AssertionError: checkpoint count 1 is different from saved mp_world_size 4 +[default6]:[2022-10-06 13:16:44,231] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default6]:Traceback (most recent call last): +[default6]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default6]: main() +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default6]: return f(*args, **kwargs) +[default6]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default6]: pretrain( +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default6]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer +[default6]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default6]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default6]: load_path, client_states = self._load_checkpoint(load_dir, +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint +[default6]: sd_loader = SDLoaderFactory.get_sd_loader( +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader +[default6]: return MegatronSDLoader(ckpt_list, version, checkpoint_engine) +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ +[default6]: super().__init__(ckpt_list, version, checkpoint_engine) +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ +[default6]: self.check_ckpt_list() +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list +[default6]: assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" +[default6]:AssertionError: checkpoint count 1 is different from saved mp_world_size 4 +[default1]:Traceback (most recent call last): +[default1]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default1]: main() +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default1]: return f(*args, **kwargs) +[default1]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default1]: pretrain( +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default1]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer +[default1]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default1]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default1]: load_path, client_states = self._load_checkpoint(load_dir, +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint +[default1]: sd_loader = SDLoaderFactory.get_sd_loader( +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader +[default1]: return MegatronSDLoader(ckpt_list, version, checkpoint_engine) +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ +[default1]: super().__init__(ckpt_list, version, checkpoint_engine) +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ +[default1]: self.check_ckpt_list() +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list +[default1]: assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" +[default1]:AssertionError: checkpoint count 1 is different from saved mp_world_size 4 +[default3]:Traceback (most recent call last): +[default3]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default3]: main() +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default3]: return f(*args, **kwargs) +[default3]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default3]: pretrain( +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default3]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer +[default3]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default3]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default3]: load_path, client_states = self._load_checkpoint(load_dir, +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint +[default3]: sd_loader = SDLoaderFactory.get_sd_loader( +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader +[default3]: return MegatronSDLoader(ckpt_list, version, checkpoint_engine) +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ +[default3]: super().__init__(ckpt_list, version, checkpoint_engine) +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ +[default3]: self.check_ckpt_list() +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list +[default3]: assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" +[default3]:AssertionError: checkpoint count 1 is different from saved mp_world_size 4 +[default3]:[2022-10-06 13:16:44,332] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default0]:[2022-10-06 13:16:44,296] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default6]:[2022-10-06 13:16:44,331] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default0]:Traceback (most recent call last): +[default0]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default2]:[2022-10-06 13:16:44,296] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default1]:[2022-10-06 13:16:44,327] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default0]: main() +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default0]: return f(*args, **kwargs) +[default0]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default0]: pretrain( +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default0]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer +[default0]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default0]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default0]: load_path, client_states = self._load_checkpoint(load_dir, +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint +[default0]: sd_loader = SDLoaderFactory.get_sd_loader( +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader +[default0]: return MegatronSDLoader(ckpt_list, version, checkpoint_engine) +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ +[default0]: super().__init__(ckpt_list, version, checkpoint_engine) +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ +[default0]: self.check_ckpt_list() +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list +[default0]: assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" +[default0]:AssertionError: checkpoint count 1 is different from saved mp_world_size 4 +[default2]:Traceback (most recent call last): +[default2]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default2]: main() +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default2]: return f(*args, **kwargs) +[default2]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default2]: pretrain( +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default2]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer +[default2]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default2]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default2]: load_path, client_states = self._load_checkpoint(load_dir, +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint +[default2]: sd_loader = SDLoaderFactory.get_sd_loader( +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader +[default2]: return MegatronSDLoader(ckpt_list, version, checkpoint_engine) +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ +[default2]: super().__init__(ckpt_list, version, checkpoint_engine) +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ +[default2]: self.check_ckpt_list() +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list +[default2]: assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" +[default2]:AssertionError: checkpoint count 1 is different from saved mp_world_size 4 +[default3]:Traceback (most recent call last): +[default0]:Traceback (most recent call last): +[default0]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default3]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default3]: main() +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default3]: return f(*args, **kwargs) +[default3]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default3]: pretrain( +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default0]: main() +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default0]: return f(*args, **kwargs) +[default0]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default0]: pretrain( +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default0]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer +[default1]:Traceback (most recent call last): +[default1]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default1]: main() +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default3]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default0]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default0]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer +[default3]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default0]: load_path, client_states = self._load_checkpoint(load_dir, +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint +[default0]: sd_loader = SDLoaderFactory.get_sd_loader( +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader +[default0]: return MegatronSDLoader(ckpt_list, version, checkpoint_engine) +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ +[default0]: super().__init__(ckpt_list, version, checkpoint_engine) +[default3]:[2022-10-06 13:16:44,253] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ +[default3]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default3]: load_path, client_states = self._load_checkpoint(load_dir, +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint +[default0]: self.check_ckpt_list() +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list +[default0]: assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" +[default1]: return f(*args, **kwargs) +[default1]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default1]: pretrain( +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default0]:AssertionError: checkpoint count 1 is different from saved mp_world_size 4 +[default3]: sd_loader = SDLoaderFactory.get_sd_loader( +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader +[default3]: return MegatronSDLoader(ckpt_list, version, checkpoint_engine) +[default1]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer +[default1]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ +[default3]: super().__init__(ckpt_list, version, checkpoint_engine) +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default1]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default1]: load_path, client_states = self._load_checkpoint(load_dir, +[default3]: self.check_ckpt_list() +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list +[default3]: assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint +[default1]: sd_loader = SDLoaderFactory.get_sd_loader( +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader +[default1]: return MegatronSDLoader(ckpt_list, version, checkpoint_engine) +[default3]:AssertionError: checkpoint count 1 is different from saved mp_world_size 4 +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ +[default1]: super().__init__(ckpt_list, version, checkpoint_engine) +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ +[default1]: self.check_ckpt_list() +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list +[default1]: assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" +[default1]:AssertionError: checkpoint count 1 is different from saved mp_world_size 4 +[default0]:[2022-10-06 13:16:44,252] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default5]:[2022-10-06 13:16:44,302] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default7]:[2022-10-06 13:16:44,329] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default1]:[2022-10-06 13:16:44,253] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default2]:Traceback (most recent call last): +[default2]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default2]: main() +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default2]: return f(*args, **kwargs) +[default2]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default2]: pretrain( +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default2]:[2022-10-06 13:16:44,300] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default2]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer +[default2]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default2]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default2]: load_path, client_states = self._load_checkpoint(load_dir, +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint +[default2]: sd_loader = SDLoaderFactory.get_sd_loader( +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader +[default2]: return MegatronSDLoader(ckpt_list, version, checkpoint_engine) +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ +[default2]: super().__init__(ckpt_list, version, checkpoint_engine) +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ +[default2]: self.check_ckpt_list() +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list +[default2]: assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" +[default2]:AssertionError: checkpoint count 1 is different from saved mp_world_size 4 +[default4]:Traceback (most recent call last): +[default4]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default4]: main() +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default4]: return f(*args, **kwargs) +[default4]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default4]: pretrain( +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default4]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer +[default4]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default4]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default4]: load_path, client_states = self._load_checkpoint(load_dir, +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint +[default4]: sd_loader = SDLoaderFactory.get_sd_loader( +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader +[default4]: return MegatronSDLoader(ckpt_list, version, checkpoint_engine) +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ +[default4]: super().__init__(ckpt_list, version, checkpoint_engine) +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ +[default4]: self.check_ckpt_list() +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list +[default4]: assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" +[default4]:AssertionError: checkpoint count 1 is different from saved mp_world_size 4 +[default5]:[2022-10-06 13:16:44,294] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default4]:[2022-10-06 13:16:44,315] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default6]:Traceback (most recent call last): +[default6]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default6]: main() +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default6]: return f(*args, **kwargs) +[default6]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default6]: pretrain( +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default6]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer +[default6]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default6]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default6]: load_path, client_states = self._load_checkpoint(load_dir, +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint +[default6]: sd_loader = SDLoaderFactory.get_sd_loader( +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader +[default6]: return MegatronSDLoader(ckpt_list, version, checkpoint_engine) +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ +[default6]: super().__init__(ckpt_list, version, checkpoint_engine) +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ +[default6]: self.check_ckpt_list() +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list +[default6]: assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" +[default6]:AssertionError: checkpoint count 1 is different from saved mp_world_size 4 +[default5]:Traceback (most recent call last): +[default5]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default5]: main() +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default5]: return f(*args, **kwargs) +[default5]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default5]: pretrain( +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default5]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer +[default5]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default5]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default5]: load_path, client_states = self._load_checkpoint(load_dir, +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint +[default5]: sd_loader = SDLoaderFactory.get_sd_loader( +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader +[default5]: return MegatronSDLoader(ckpt_list, version, checkpoint_engine) +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ +[default5]: super().__init__(ckpt_list, version, checkpoint_engine) +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ +[default5]: self.check_ckpt_list() +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list +[default5]: assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" +[default5]:AssertionError: checkpoint count 1 is different from saved mp_world_size 4 +[default5]:Traceback (most recent call last): +[default7]:Traceback (most recent call last): +[default7]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default7]: main() +[default5]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default5]: main() +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default7]: return f(*args, **kwargs) +[default7]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default5]: return f(*args, **kwargs) +[default5]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default5]: pretrain( +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default5]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default7]: pretrain( +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer +[default5]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default5]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default7]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer +[default5]: load_path, client_states = self._load_checkpoint(load_dir, +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint +[default7]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default5]: sd_loader = SDLoaderFactory.get_sd_loader( +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader +[default7]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default7]: load_path, client_states = self._load_checkpoint(load_dir, +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint +[default7]: sd_loader = SDLoaderFactory.get_sd_loader( +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader +[default7]: return MegatronSDLoader(ckpt_list, version, checkpoint_engine) +[default5]: return MegatronSDLoader(ckpt_list, version, checkpoint_engine) +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ +[default7]: super().__init__(ckpt_list, version, checkpoint_engine) +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ +[default5]: super().__init__(ckpt_list, version, checkpoint_engine) +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ +[default5]: self.check_ckpt_list() +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list +[default7]: self.check_ckpt_list() +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list +[default7]: assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" +[default5]: assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" +[default5]:AssertionError: checkpoint count 1 is different from saved mp_world_size 4 +[default7]:AssertionError: checkpoint count 1 is different from saved mp_world_size 4 +[default7]:[2022-10-06 13:16:44,397] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default5]:[2022-10-06 13:16:44,389] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default4]:[2022-10-06 13:16:44,398] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default4]:Traceback (most recent call last): +[default4]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default4]: main() +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default4]: return f(*args, **kwargs) +[default4]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default4]: pretrain( +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default4]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer +[default4]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default4]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default4]: load_path, client_states = self._load_checkpoint(load_dir, +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint +[default4]: sd_loader = SDLoaderFactory.get_sd_loader( +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader +[default4]: return MegatronSDLoader(ckpt_list, version, checkpoint_engine) +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ +[default4]: super().__init__(ckpt_list, version, checkpoint_engine) +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ +[default4]: self.check_ckpt_list() +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list +[default4]: assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" +[default4]:AssertionError: checkpoint count 1 is different from saved mp_world_size 4 +[default4]:Traceback (most recent call last): +[default4]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default4]: main() +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default4]: return f(*args, **kwargs) +[default4]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default4]: pretrain( +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default4]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer +[default4]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default4]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default4]: load_path, client_states = self._load_checkpoint(load_dir, +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint +[default4]: sd_loader = SDLoaderFactory.get_sd_loader( +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader +[default4]: return MegatronSDLoader(ckpt_list, version, checkpoint_engine) +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ +[default4]: super().__init__(ckpt_list, version, checkpoint_engine) +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ +[default4]: self.check_ckpt_list() +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list +[default4]: assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" +[default4]:AssertionError: checkpoint count 1 is different from saved mp_world_size 4 +[default7]:[2022-10-06 13:16:44,439] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default4]:[2022-10-06 13:16:44,396] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default5]:Traceback (most recent call last): +[default5]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default5]: main() +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default5]: return f(*args, **kwargs) +[default5]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default5]: pretrain( +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default5]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer +[default5]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default5]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default5]: load_path, client_states = self._load_checkpoint(load_dir, +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint +[default5]: sd_loader = SDLoaderFactory.get_sd_loader( +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader +[default5]: return MegatronSDLoader(ckpt_list, version, checkpoint_engine) +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ +[default5]: super().__init__(ckpt_list, version, checkpoint_engine) +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ +[default5]: self.check_ckpt_list() +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list +[default5]: assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" +[default5]:AssertionError: checkpoint count 1 is different from saved mp_world_size 4 +[default7]:Traceback (most recent call last): +[default7]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default7]: main() +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default7]: return f(*args, **kwargs) +[default7]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default7]: pretrain( +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default7]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer +[default7]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default7]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default7]: load_path, client_states = self._load_checkpoint(load_dir, +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint +[default7]: sd_loader = SDLoaderFactory.get_sd_loader( +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader +[default7]: return MegatronSDLoader(ckpt_list, version, checkpoint_engine) +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ +[default7]: super().__init__(ckpt_list, version, checkpoint_engine) +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ +[default7]: self.check_ckpt_list() +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list +[default7]: assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" +[default7]:AssertionError: checkpoint count 1 is different from saved mp_world_size 4 +[default7]:Traceback (most recent call last): +[default7]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default7]: main() +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default7]: return f(*args, **kwargs) +[default7]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default7]: pretrain( +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default7]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer +[default7]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default7]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default7]: load_path, client_states = self._load_checkpoint(load_dir, +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint +[default7]: sd_loader = SDLoaderFactory.get_sd_loader( +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader +[default7]: return MegatronSDLoader(ckpt_list, version, checkpoint_engine) +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ +[default7]: super().__init__(ckpt_list, version, checkpoint_engine) +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ +[default7]: self.check_ckpt_list() +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list +[default7]: assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" +[default7]:AssertionError: checkpoint count 1 is different from saved mp_world_size 4 +[default6]:[2022-10-06 13:16:44,469] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default6]:Traceback (most recent call last): +[default6]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default6]: main() +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default6]: return f(*args, **kwargs) +[default6]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default6]: pretrain( +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default6]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer +[default6]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default6]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default6]: load_path, client_states = self._load_checkpoint(load_dir, +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint +[default6]: sd_loader = SDLoaderFactory.get_sd_loader( +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader +[default6]: return MegatronSDLoader(ckpt_list, version, checkpoint_engine) +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ +[default6]: super().__init__(ckpt_list, version, checkpoint_engine) +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ +[default6]: self.check_ckpt_list() +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list +[default6]: assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" +[default6]:AssertionError: checkpoint count 1 is different from saved mp_world_size 4 +[default6]:[2022-10-06 13:16:44,692] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default5]:[2022-10-06 13:16:44,691] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default0]:Traceback (most recent call last): +[default0]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default0]: main() +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default0]: return f(*args, **kwargs) +[default0]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default0]: pretrain( +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default1]:[2022-10-06 13:16:44,694] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default3]:[2022-10-06 13:16:44,689] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default0]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer +[default0]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default2]:[2022-10-06 13:16:44,695] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default0]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default0]: load_path, client_states = self._load_checkpoint(load_dir, +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint +[default0]: sd_loader = SDLoaderFactory.get_sd_loader( +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader +[default0]: return MegatronSDLoader(ckpt_list, version, checkpoint_engine) +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ +[default0]: super().__init__(ckpt_list, version, checkpoint_engine) +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ +[default0]: self.check_ckpt_list() +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list +[default0]: assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" +[default0]:AssertionError: checkpoint count 1 is different from saved mp_world_size 4 +[default3]:Traceback (most recent call last): +[default3]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default3]: main() +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default3]: return f(*args, **kwargs) +[default3]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default1]:Traceback (most recent call last): +[default1]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default1]: main() +[default3]: pretrain( +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default3]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer +[default3]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default2]:Traceback (most recent call last): +[default2]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default2]: main() +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default1]: return f(*args, **kwargs) +[default1]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default1]: pretrain( +[default2]: return f(*args, **kwargs) +[default2]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default2]: pretrain( +[default3]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default3]: load_path, client_states = self._load_checkpoint(load_dir, +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint +[default3]: sd_loader = SDLoaderFactory.get_sd_loader( +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default1]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default2]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer +[default2]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default1]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default1]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default2]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default2]: load_path, client_states = self._load_checkpoint(load_dir, +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader +[default3]: return MegatronSDLoader(ckpt_list, version, checkpoint_engine) +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default1]: load_path, client_states = self._load_checkpoint(load_dir, +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ +[default1]: sd_loader = SDLoaderFactory.get_sd_loader( +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader +[default2]: sd_loader = SDLoaderFactory.get_sd_loader( +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader +[default1]: return MegatronSDLoader(ckpt_list, version, checkpoint_engine) +[default2]: return MegatronSDLoader(ckpt_list, version, checkpoint_engine) +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ +[default1]: super().__init__(ckpt_list, version, checkpoint_engine) +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ +[default2]: super().__init__(ckpt_list, version, checkpoint_engine) +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ +[default2]: self.check_ckpt_list() +[default1]: self.check_ckpt_list() +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list +[default1]: assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" +[default3]: super().__init__(ckpt_list, version, checkpoint_engine) +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ +[default1]:AssertionError: checkpoint count 1 is different from saved mp_world_size 4 +[default3]: self.check_ckpt_list() +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list +[default3]: assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list +[default3]:AssertionError: checkpoint count 1 is different from saved mp_world_size 4 +[default2]: assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" +[default2]:AssertionError: checkpoint count 1 is different from saved mp_world_size 4 +[default0]:[2022-10-06 13:16:44,688] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default5]:Traceback (most recent call last): +[default5]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default5]: main() +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default5]: return f(*args, **kwargs) +[default5]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default5]: pretrain( +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default5]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer +[default5]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default5]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default5]: load_path, client_states = self._load_checkpoint(load_dir, +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint +[default5]: sd_loader = SDLoaderFactory.get_sd_loader( +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader +[default5]: return MegatronSDLoader(ckpt_list, version, checkpoint_engine) +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ +[default5]: super().__init__(ckpt_list, version, checkpoint_engine) +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ +[default5]: self.check_ckpt_list() +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list +[default5]: assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" +[default5]:AssertionError: checkpoint count 1 is different from saved mp_world_size 4 +[default7]:Traceback (most recent call last): +[default7]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default7]: main() +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default7]: return f(*args, **kwargs) +[default7]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default7]: pretrain( +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default7]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer +[default7]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default7]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default7]: load_path, client_states = self._load_checkpoint(load_dir, +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint +[default7]: sd_loader = SDLoaderFactory.get_sd_loader( +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader +[default7]: return MegatronSDLoader(ckpt_list, version, checkpoint_engine) +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ +[default7]: super().__init__(ckpt_list, version, checkpoint_engine) +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ +[default7]: self.check_ckpt_list() +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list +[default7]: assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" +[default7]:AssertionError: checkpoint count 1 is different from saved mp_world_size 4 +[default6]:Traceback (most recent call last): +[default6]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default6]: main() +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default6]: return f(*args, **kwargs) +[default6]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default6]: pretrain( +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default6]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer +[default6]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default6]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default6]: load_path, client_states = self._load_checkpoint(load_dir, +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint +[default6]: sd_loader = SDLoaderFactory.get_sd_loader( +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader +[default6]: return MegatronSDLoader(ckpt_list, version, checkpoint_engine) +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ +[default6]: super().__init__(ckpt_list, version, checkpoint_engine) +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ +[default6]: self.check_ckpt_list() +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list +[default6]: assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" +[default6]:AssertionError: checkpoint count 1 is different from saved mp_world_size 4 +[default4]:[2022-10-06 13:16:44,737] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default0]:Traceback (most recent call last): +[default0]:[2022-10-06 13:16:44,751] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default0]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default0]: main() +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default0]: return f(*args, **kwargs) +[default0]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default0]: pretrain( +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default0]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer +[default0]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default0]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default7]:[2022-10-06 13:16:44,736] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default0]: load_path, client_states = self._load_checkpoint(load_dir, +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint +[default0]: sd_loader = SDLoaderFactory.get_sd_loader( +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader +[default0]: return MegatronSDLoader(ckpt_list, version, checkpoint_engine) +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ +[default0]: super().__init__(ckpt_list, version, checkpoint_engine) +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ +[default0]: self.check_ckpt_list() +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list +[default0]: assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" +[default0]:AssertionError: checkpoint count 1 is different from saved mp_world_size 4 +[default3]:[2022-10-06 13:16:44,762] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default1]:[2022-10-06 13:16:44,747] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default4]:[2022-10-06 13:16:44,819] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default2]:[2022-10-06 13:16:44,767] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default0]:[2022-10-06 13:16:44,742] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default4]:Traceback (most recent call last): +[default4]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default4]: main() +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default4]: return f(*args, **kwargs) +[default4]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default4]: pretrain( +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default4]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer +[default4]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default4]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default4]: load_path, client_states = self._load_checkpoint(load_dir, +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint +[default4]: sd_loader = SDLoaderFactory.get_sd_loader( +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader +[default4]: return MegatronSDLoader(ckpt_list, version, checkpoint_engine) +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ +[default4]: super().__init__(ckpt_list, version, checkpoint_engine) +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ +[default4]: self.check_ckpt_list() +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list +[default4]: assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" +[default4]:AssertionError: checkpoint count 1 is different from saved mp_world_size 4 +[default4]:Traceback (most recent call last): +[default4]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default4]: main() +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default4]: return f(*args, **kwargs) +[default4]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default4]: pretrain( +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default4]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer +[default4]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default4]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default4]: load_path, client_states = self._load_checkpoint(load_dir, +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint +[default4]: sd_loader = SDLoaderFactory.get_sd_loader( +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader +[default4]: return MegatronSDLoader(ckpt_list, version, checkpoint_engine) +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ +[default4]: super().__init__(ckpt_list, version, checkpoint_engine) +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ +[default4]: self.check_ckpt_list() +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list +[default4]: assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" +[default4]:AssertionError: checkpoint count 1 is different from saved mp_world_size 4 +[default5]:[2022-10-06 13:16:44,835] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default4]:[2022-10-06 13:16:44,828] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default6]:[2022-10-06 13:16:44,813] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default4]:Traceback (most recent call last): +[default4]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default4]: main() +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default4]: return f(*args, **kwargs) +[default4]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default4]: pretrain( +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default4]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer +[default4]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default4]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default4]: load_path, client_states = self._load_checkpoint(load_dir, +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint +[default4]: sd_loader = SDLoaderFactory.get_sd_loader( +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader +[default4]: return MegatronSDLoader(ckpt_list, version, checkpoint_engine) +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ +[default4]: super().__init__(ckpt_list, version, checkpoint_engine) +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ +[default4]: self.check_ckpt_list() +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list +[default4]: assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" +[default4]:AssertionError: checkpoint count 1 is different from saved mp_world_size 4 +[default3]:Traceback (most recent call last): +[default3]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default3]: main() +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default3]: return f(*args, **kwargs) +[default3]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default3]: pretrain( +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default3]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer +[default3]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default3]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default3]: load_path, client_states = self._load_checkpoint(load_dir, +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint +[default3]: sd_loader = SDLoaderFactory.get_sd_loader( +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader +[default3]: return MegatronSDLoader(ckpt_list, version, checkpoint_engine) +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ +[default3]: super().__init__(ckpt_list, version, checkpoint_engine) +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ +[default3]: self.check_ckpt_list() +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list +[default3]: assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" +[default3]:AssertionError: checkpoint count 1 is different from saved mp_world_size 4 +[default2]:Traceback (most recent call last): +[default0]:Traceback (most recent call last): +[default0]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default0]: main() +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default0]: return f(*args, **kwargs) +[default0]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default0]: pretrain( +[default2]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default1]:Traceback (most recent call last): +[default1]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default1]: main() +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default0]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer +[default0]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default2]: main() +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default2]: return f(*args, **kwargs) +[default1]: return f(*args, **kwargs) +[default1]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default1]: pretrain( +[default2]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default2]: pretrain( +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default2]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer +[default2]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default1]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default2]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default2]: load_path, client_states = self._load_checkpoint(load_dir, +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default2]: sd_loader = SDLoaderFactory.get_sd_loader( +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader +[default1]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default2]: return MegatronSDLoader(ckpt_list, version, checkpoint_engine) +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ +[default0]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default2]: super().__init__(ckpt_list, version, checkpoint_engine) +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ +[default2]: self.check_ckpt_list() +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list +[default0]: load_path, client_states = self._load_checkpoint(load_dir, +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint +[default0]: sd_loader = SDLoaderFactory.get_sd_loader( +[default2]: assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" +[default2]:AssertionError: checkpoint count 1 is different from saved mp_world_size 4 +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader +[default0]: return MegatronSDLoader(ckpt_list, version, checkpoint_engine) +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ +[default0]: super().__init__(ckpt_list, version, checkpoint_engine) +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default1]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default1]: load_path, client_states = self._load_checkpoint(load_dir, +[default0]: self.check_ckpt_list() +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list +[default0]: assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" +[default0]:AssertionError: checkpoint count 1 is different from saved mp_world_size 4 +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint +[default1]: sd_loader = SDLoaderFactory.get_sd_loader( +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader +[default1]: return MegatronSDLoader(ckpt_list, version, checkpoint_engine) +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ +[default1]: super().__init__(ckpt_list, version, checkpoint_engine) +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ +[default1]: self.check_ckpt_list() +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list +[default1]: assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" +[default1]:AssertionError: checkpoint count 1 is different from saved mp_world_size 4 +[default5]:Traceback (most recent call last): +[default5]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default5]: main() +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default5]: return f(*args, **kwargs) +[default5]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default5]: pretrain( +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default5]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer +[default5]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default5]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default5]: load_path, client_states = self._load_checkpoint(load_dir, +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint +[default5]: sd_loader = SDLoaderFactory.get_sd_loader( +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader +[default5]: return MegatronSDLoader(ckpt_list, version, checkpoint_engine) +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ +[default5]: super().__init__(ckpt_list, version, checkpoint_engine) +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ +[default5]: self.check_ckpt_list() +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list +[default5]: assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" +[default5]:AssertionError: checkpoint count 1 is different from saved mp_world_size 4 +[default6]:Traceback (most recent call last): +[default6]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default6]: main() +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default6]: return f(*args, **kwargs) +[default6]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default6]: pretrain( +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default6]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer +[default6]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default6]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default6]: load_path, client_states = self._load_checkpoint(load_dir, +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint +[default6]: sd_loader = SDLoaderFactory.get_sd_loader( +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader +[default6]: return MegatronSDLoader(ckpt_list, version, checkpoint_engine) +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ +[default6]: super().__init__(ckpt_list, version, checkpoint_engine) +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ +[default6]: self.check_ckpt_list() +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list +[default6]: assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" +[default6]:AssertionError: checkpoint count 1 is different from saved mp_world_size 4 +[default1]:[2022-10-06 13:16:44,918] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default3]:[2022-10-06 13:16:44,854] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default1]:Traceback (most recent call last): +[default1]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default1]: main() +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default1]: return f(*args, **kwargs) +[default1]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default1]: pretrain( +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default1]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer +[default1]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default1]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default1]: load_path, client_states = self._load_checkpoint(load_dir, +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint +[default1]: sd_loader = SDLoaderFactory.get_sd_loader( +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader +[default1]: return MegatronSDLoader(ckpt_list, version, checkpoint_engine) +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ +[default1]: super().__init__(ckpt_list, version, checkpoint_engine) +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ +[default1]: self.check_ckpt_list() +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list +[default1]: assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" +[default1]:AssertionError: checkpoint count 1 is different from saved mp_world_size 4 +[default0]:[2022-10-06 13:16:44,931] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default3]:Traceback (most recent call last): +[default3]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default3]: main() +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default3]: return f(*args, **kwargs) +[default3]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default3]: pretrain( +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default3]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer +[default3]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default3]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default3]: load_path, client_states = self._load_checkpoint(load_dir, +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint +[default3]: sd_loader = SDLoaderFactory.get_sd_loader( +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader +[default3]: return MegatronSDLoader(ckpt_list, version, checkpoint_engine) +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ +[default3]: super().__init__(ckpt_list, version, checkpoint_engine) +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ +[default3]: self.check_ckpt_list() +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list +[default3]: assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" +[default3]:AssertionError: checkpoint count 1 is different from saved mp_world_size 4 +[default0]:Traceback (most recent call last): +[default0]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default0]: main() +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default0]: return f(*args, **kwargs) +[default0]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default0]: pretrain( +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default0]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer +[default0]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default0]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default3]:[2022-10-06 13:16:44,931] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default2]:Traceback (most recent call last): +[default2]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default2]: main() +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default2]: return f(*args, **kwargs) +[default2]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default2]: pretrain( +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default2]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer +[default2]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default2]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default2]: load_path, client_states = self._load_checkpoint(load_dir, +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint +[default2]: sd_loader = SDLoaderFactory.get_sd_loader( +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader +[default2]: return MegatronSDLoader(ckpt_list, version, checkpoint_engine) +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ +[default2]: super().__init__(ckpt_list, version, checkpoint_engine) +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ +[default2]: self.check_ckpt_list() +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list +[default2]: assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" +[default2]:AssertionError: checkpoint count 1 is different from saved mp_world_size 4 +[default7]:[2022-10-06 13:16:44,905] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default7]:[2022-10-06 13:16:44,873] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default5]:[2022-10-06 13:16:44,861] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default3]:[2022-10-06 13:16:44,945] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default0]:[2022-10-06 13:16:44,945] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default1]:[2022-10-06 13:16:44,951] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default6]:[2022-10-06 13:16:44,896] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default2]:[2022-10-06 13:16:44,949] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default7]:Traceback (most recent call last): +[default7]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default7]: main() +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default7]: return f(*args, **kwargs) +[default7]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default7]: pretrain( +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default7]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer +[default7]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default7]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default7]: load_path, client_states = self._load_checkpoint(load_dir, +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint +[default7]: sd_loader = SDLoaderFactory.get_sd_loader( +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader +[default7]: return MegatronSDLoader(ckpt_list, version, checkpoint_engine) +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ +[default7]: super().__init__(ckpt_list, version, checkpoint_engine) +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ +[default7]: self.check_ckpt_list() +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list +[default7]: assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" +[default7]:AssertionError: checkpoint count 1 is different from saved mp_world_size 4 +[default3]:Traceback (most recent call last): +[default3]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default3]: main() +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default3]: return f(*args, **kwargs) +[default3]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default3]: pretrain( +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default3]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer +[default6]:Traceback (most recent call last): +[default6]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default6]: main() +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default5]:Traceback (most recent call last): +[default5]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default5]: main() +[default6]: return f(*args, **kwargs) +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default3]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default3]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default3]: load_path, client_states = self._load_checkpoint(load_dir, +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint +[default5]: return f(*args, **kwargs) +[default5]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default5]: pretrain( +[default6]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default6]: pretrain( +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default3]: sd_loader = SDLoaderFactory.get_sd_loader( +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader +[default3]: return MegatronSDLoader(ckpt_list, version, checkpoint_engine) +[default6]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer +[default6]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default6]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default6]: load_path, client_states = self._load_checkpoint(load_dir, +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default5]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ +[default3]: super().__init__(ckpt_list, version, checkpoint_engine) +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ +[default3]: self.check_ckpt_list() +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list +[default6]: sd_loader = SDLoaderFactory.get_sd_loader( +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer +[default5]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader +[default6]: return MegatronSDLoader(ckpt_list, version, checkpoint_engine) +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default5]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ +[default5]: load_path, client_states = self._load_checkpoint(load_dir, +[default6]: super().__init__(ckpt_list, version, checkpoint_engine) +[default3]: assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint +[default5]: sd_loader = SDLoaderFactory.get_sd_loader( +[default3]:AssertionError: checkpoint count 1 is different from saved mp_world_size 4 +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader +[default6]: self.check_ckpt_list() +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list +[default5]: return MegatronSDLoader(ckpt_list, version, checkpoint_engine) +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ +[default6]: assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" +[default6]:AssertionError: checkpoint count 1 is different from saved mp_world_size 4 +[default5]: super().__init__(ckpt_list, version, checkpoint_engine) +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ +[default5]: self.check_ckpt_list() +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list +[default5]: assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" +[default5]:AssertionError: checkpoint count 1 is different from saved mp_world_size 4 +[default5]:Traceback (most recent call last): +[default5]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default5]: main() +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default5]: return f(*args, **kwargs) +[default7]:Traceback (most recent call last): +[default7]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default5]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default5]: pretrain( +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default5]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer +[default5]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default7]: main() +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default7]: return f(*args, **kwargs) +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default5]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default7]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default7]: pretrain( +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default5]: load_path, client_states = self._load_checkpoint(load_dir, +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint +[default5]: sd_loader = SDLoaderFactory.get_sd_loader( +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default7]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader +[default5]: return MegatronSDLoader(ckpt_list, version, checkpoint_engine) +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ +[default5]: super().__init__(ckpt_list, version, checkpoint_engine) +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer +[default7]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ +[default5]: self.check_ckpt_list() +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default7]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list +[default5]: assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" +[default5]:AssertionError: checkpoint count 1 is different from saved mp_world_size 4 +[default7]: load_path, client_states = self._load_checkpoint(load_dir, +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint +[default7]: sd_loader = SDLoaderFactory.get_sd_loader( +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader +[default7]: return MegatronSDLoader(ckpt_list, version, checkpoint_engine) +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ +[default7]: super().__init__(ckpt_list, version, checkpoint_engine) +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ +[default7]: self.check_ckpt_list() +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list +[default7]: assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" +[default7]:AssertionError: checkpoint count 1 is different from saved mp_world_size 4 +[default6]:Traceback (most recent call last): +[default6]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default6]: main() +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default6]: return f(*args, **kwargs) +[default6]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default6]: pretrain( +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default6]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer +[default6]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default6]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default6]: load_path, client_states = self._load_checkpoint(load_dir, +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint +[default6]: sd_loader = SDLoaderFactory.get_sd_loader( +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader +[default6]: return MegatronSDLoader(ckpt_list, version, checkpoint_engine) +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ +[default6]: super().__init__(ckpt_list, version, checkpoint_engine) +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ +[default6]: self.check_ckpt_list() +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list +[default6]: assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" +[default6]:AssertionError: checkpoint count 1 is different from saved mp_world_size 4 +[default7]:Traceback (most recent call last): +[default7]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default7]: main() +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default7]: return f(*args, **kwargs) +[default7]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default7]: pretrain( +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default7]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer +[default7]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default7]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default7]: load_path, client_states = self._load_checkpoint(load_dir, +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint +[default7]: sd_loader = SDLoaderFactory.get_sd_loader( +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader +[default7]: return MegatronSDLoader(ckpt_list, version, checkpoint_engine) +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ +[default7]: super().__init__(ckpt_list, version, checkpoint_engine) +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ +[default7]: self.check_ckpt_list() +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list +[default7]: assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" +[default7]:AssertionError: checkpoint count 1 is different from saved mp_world_size 4 +[default2]:[2022-10-06 13:16:44,933] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default1]:[2022-10-06 13:16:44,934] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default2]:[2022-10-06 13:16:44,933] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default2]:Traceback (most recent call last): +[default2]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default2]: main() +[default6]:[2022-10-06 13:16:44,937] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default2]: return f(*args, **kwargs) +[default2]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default2]: pretrain( +[default5]:[2022-10-06 13:16:44,943] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default2]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default7]:[2022-10-06 13:16:44,943] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer +[default2]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default2]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default2]: load_path, client_states = self._load_checkpoint(load_dir, +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint +[default2]: sd_loader = SDLoaderFactory.get_sd_loader( +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader +[default2]: return MegatronSDLoader(ckpt_list, version, checkpoint_engine) +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ +[default2]: super().__init__(ckpt_list, version, checkpoint_engine) +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ +[default2]: self.check_ckpt_list() +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list +[default2]: assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" +[default1]:Traceback (most recent call last): +[default1]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default2]:AssertionError: checkpoint count 1 is different from saved mp_world_size 4 +[default1]: main() +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default1]: return f(*args, **kwargs) +[default1]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default1]: pretrain( +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default1]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer +[default1]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default1]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default1]: load_path, client_states = self._load_checkpoint(load_dir, +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint +[default1]: sd_loader = SDLoaderFactory.get_sd_loader( +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader +[default1]: return MegatronSDLoader(ckpt_list, version, checkpoint_engine) +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ +[default1]: super().__init__(ckpt_list, version, checkpoint_engine) +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ +[default1]: self.check_ckpt_list() +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list +[default1]: assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" +[default1]:AssertionError: checkpoint count 1 is different from saved mp_world_size 4 +[default4]:[2022-10-06 13:16:44,942] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default4]:Traceback (most recent call last): +[default4]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default4]: main() +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default4]: return f(*args, **kwargs) +[default4]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default4]: pretrain( +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default4]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer +[default4]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default4]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default4]: load_path, client_states = self._load_checkpoint(load_dir, +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint +[default4]: sd_loader = SDLoaderFactory.get_sd_loader( +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader +[default4]: return MegatronSDLoader(ckpt_list, version, checkpoint_engine) +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ +[default4]: super().__init__(ckpt_list, version, checkpoint_engine) +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ +[default4]: self.check_ckpt_list() +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list +[default4]: assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" +[default4]:AssertionError: checkpoint count 1 is different from saved mp_world_size 4 +[default0]: load_path, client_states = self._load_checkpoint(load_dir, +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint +[default0]: sd_loader = SDLoaderFactory.get_sd_loader( +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader +[default0]: return MegatronSDLoader(ckpt_list, version, checkpoint_engine) +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ +[default0]: super().__init__(ckpt_list, version, checkpoint_engine) +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ +[default0]: self.check_ckpt_list() +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list +[default0]: assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" +[default0]:AssertionError: checkpoint count 1 is different from saved mp_world_size 4 +[default5]:[2022-10-06 13:16:44,955] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default4]:[2022-10-06 13:16:44,959] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default6]:[2022-10-06 13:16:44,959] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default5]:Traceback (most recent call last): +[default5]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default5]: main() +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default5]: return f(*args, **kwargs) +[default5]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default5]: pretrain( +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default5]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer +[default4]:Traceback (most recent call last): +[default4]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default4]: main() +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default4]: return f(*args, **kwargs) +[default4]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default4]: pretrain( +[default5]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default3]:Traceback (most recent call last): +[default0]:Traceback (most recent call last): +[default0]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default5]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default0]: main() +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default0]: return f(*args, **kwargs) +[default3]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default3]: main() +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default3]: return f(*args, **kwargs) +[default3]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default3]: pretrain( +[default7]:[2022-10-06 13:16:44,954] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default0]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default0]: pretrain( +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default0]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default3]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default3]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default3]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default0]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default4]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer +[default0]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default4]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default5]: load_path, client_states = self._load_checkpoint(load_dir, +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint +[default5]: sd_loader = SDLoaderFactory.get_sd_loader( +[default0]: load_path, client_states = self._load_checkpoint(load_dir, +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint +[default0]: sd_loader = SDLoaderFactory.get_sd_loader( +[default4]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default4]: load_path, client_states = self._load_checkpoint(load_dir, +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader +[default0]: return MegatronSDLoader(ckpt_list, version, checkpoint_engine) +[default4]: sd_loader = SDLoaderFactory.get_sd_loader( +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ +[default0]: super().__init__(ckpt_list, version, checkpoint_engine) +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ +[default0]: self.check_ckpt_list() +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader +[default5]: return MegatronSDLoader(ckpt_list, version, checkpoint_engine) +[default0]: assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" +[default0]:AssertionError: checkpoint count 1 is different from saved mp_world_size 4 +[default6]:Traceback (most recent call last): +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader +[default6]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default7]:Traceback (most recent call last): +[default7]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default6]: main() +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default6]: return f(*args, **kwargs) +[default6]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default3]: load_path, client_states = self._load_checkpoint(load_dir, +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint +[default4]: return MegatronSDLoader(ckpt_list, version, checkpoint_engine) +[default1]:Traceback (most recent call last): +[default1]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ +[default3]: sd_loader = SDLoaderFactory.get_sd_loader( +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader +[default3]: return MegatronSDLoader(ckpt_list, version, checkpoint_engine) +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ +[default2]:Traceback (most recent call last): +[default2]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default2]: main() +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default2]: return f(*args, **kwargs) +[default2]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default2]: pretrain( +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default2]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default3]: super().__init__(ckpt_list, version, checkpoint_engine) +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ +[default5]: super().__init__(ckpt_list, version, checkpoint_engine) +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ +[default1]: main() +[default6]: pretrain( +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ +[default4]: super().__init__(ckpt_list, version, checkpoint_engine) +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default1]: return f(*args, **kwargs) +[default1]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default1]: pretrain( +[default3]: self.check_ckpt_list() +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list +[default3]: assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" +[default3]:AssertionError: checkpoint count 1 is different from saved mp_world_size 4 +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer +[default2]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default2]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default2]: load_path, client_states = self._load_checkpoint(load_dir, +[default7]: main() +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint +[default6]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default4]: self.check_ckpt_list() +[default7]: return f(*args, **kwargs) +[default1]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default5]: self.check_ckpt_list() +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list +[default4]: assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" +[default4]:AssertionError: checkpoint count 1 is different from saved mp_world_size 4 +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer +[default7]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default7]: pretrain( +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default7]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer +[default7]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default5]: assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" +[default5]:AssertionError: checkpoint count 1 is different from saved mp_world_size 4 +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer +[default1]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default2]: sd_loader = SDLoaderFactory.get_sd_loader( +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default7]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default7]: load_path, client_states = self._load_checkpoint(load_dir, +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint +[default7]: sd_loader = SDLoaderFactory.get_sd_loader( +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader +[default2]: return MegatronSDLoader(ckpt_list, version, checkpoint_engine) +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ +[default2]: super().__init__(ckpt_list, version, checkpoint_engine) +[default6]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default2]: self.check_ckpt_list() +[default6]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default2]: assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" +[default6]: load_path, client_states = self._load_checkpoint(load_dir, +[default2]:AssertionError: checkpoint count 1 is different from saved mp_world_size 4 +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint +[default7]: return MegatronSDLoader(ckpt_list, version, checkpoint_engine) +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ +[default1]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default7]: super().__init__(ckpt_list, version, checkpoint_engine) +[default1]: load_path, client_states = self._load_checkpoint(load_dir, +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint +[default1]: sd_loader = SDLoaderFactory.get_sd_loader( +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ +[default6]: sd_loader = SDLoaderFactory.get_sd_loader( +[default7]: self.check_ckpt_list() +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader +[default6]: return MegatronSDLoader(ckpt_list, version, checkpoint_engine) +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ +[default7]: assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" +[default6]: super().__init__(ckpt_list, version, checkpoint_engine) +[default7]:AssertionError: checkpoint count 1 is different from saved mp_world_size 4 +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ +[default6]: self.check_ckpt_list() +[default1]: return MegatronSDLoader(ckpt_list, version, checkpoint_engine) +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ +[default1]: super().__init__(ckpt_list, version, checkpoint_engine) +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list +[default6]: assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" +[default6]:AssertionError: checkpoint count 1 is different from saved mp_world_size 4 +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ +[default1]: self.check_ckpt_list() +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list +[default1]: assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" +[default1]:AssertionError: checkpoint count 1 is different from saved mp_world_size 4 +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 1754219 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 1754220 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 1754221 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 1754223 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 1754225 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 1754226 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 3527396 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 3527397 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 3527398 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 3439568 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 3527400 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 3527402 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 3439569 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 3439570 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 3439572 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 3439573 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 1095168 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 3439574 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 3439575 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 1095169 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 1095172 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 1095174 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 1095175 closing signal SIGTERM +ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 2 (pid: 1095170) of binary: /gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/bin/python +ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 3 (pid: 3527399) of binary: /gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/bin/python +ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 3 (pid: 1754222) of binary: /gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/bin/python +ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 3 (pid: 3439571) of binary: /gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/bin/python +ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 0 (pid: 2798328) of binary: /gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/bin/python +ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 0 (pid: 3915822) of binary: /gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/bin/python +ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 0 (pid: 217426) of binary: /gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/bin/python +ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 0 (pid: 827545) of binary: /gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/bin/python +Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 194, in _run_module_as_main +Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 194, in _run_module_as_main + return _run_code(code, main_globals, None, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 87, in _run_code + exec(code, run_globals) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 765, in +Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 194, in _run_module_as_main +Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 194, in _run_module_as_main + return _run_code(code, main_globals, None, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 87, in _run_code + return _run_code(code, main_globals, None, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 87, in _run_code + exec(code, run_globals) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 765, in +Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 194, in _run_module_as_main + exec(code, run_globals) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 765, in + return _run_code(code, main_globals, None, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 87, in _run_code + exec(code, run_globals) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 765, in + main() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 761, in main + main() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return _run_code(code, main_globals, None, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 87, in _run_code + return f(*args, **kwargs) + exec(code, run_globals) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 761, in main + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 765, in + run(args) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 752, in run + main() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + run(args) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 752, in run + return f(*args, **kwargs) + elastic_launch( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 131, in __call__ +Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 194, in _run_module_as_main + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 761, in main + return launch_agent(self._config, self._entrypoint, list(args)) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 245, in launch_agent + return _run_code(code, main_globals, None, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 87, in _run_code + raise ChildFailedError( + elastic_launch( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 131, in __call__ + exec(code, run_globals) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 765, in + run(args) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 752, in run +torch.distributed.elastic.multiprocessing.errors.ChildFailedError: +============================================================ +/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py FAILED +------------------------------------------------------------ +Failures: +[1]: + time : 2022-10-06_13:16:44 + host : jean-zay-iam29-ib0 + rank : 53 (local_rank: 5) + exitcode : 1 (pid: 1754224) + error_file: /tmp/torchelastic_jd2fqef8/none_lpkj8trn/attempt_0/5/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint + sd_loader = SDLoaderFactory.get_sd_loader( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader + return MegatronSDLoader(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ + super().__init__(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ + self.check_ckpt_list() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list + assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" + AssertionError: checkpoint count 1 is different from saved mp_world_size 4 + +------------------------------------------------------------ +Root Cause (first observed failure): +[0]: + time : 2022-10-06_13:16:44 + host : jean-zay-iam29-ib0 + rank : 51 (local_rank: 3) + exitcode : 1 (pid: 1754222) + error_file: /tmp/torchelastic_jd2fqef8/none_lpkj8trn/attempt_0/3/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + return launch_agent(self._config, self._entrypoint, list(args)) +Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 194, in _run_module_as_main + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 245, in launch_agent + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint + sd_loader = SDLoaderFactory.get_sd_loader( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader + return MegatronSDLoader(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ + super().__init__(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ + self.check_ckpt_list() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list + assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" + AssertionError: checkpoint count 1 is different from saved mp_world_size 4 + +============================================================ + raise ChildFailedError( + return _run_code(code, main_globals, None, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 87, in _run_code + main() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +torch.distributed.elastic.multiprocessing.errors.ChildFailedError: +============================================================ +/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py FAILED +------------------------------------------------------------ +Failures: +[1]: + time : 2022-10-06_13:16:44 + host : jean-zay-iam10-ib0 + rank : 9 (local_rank: 1) + exitcode : 1 (pid: 2798329) + error_file: /tmp/torchelastic_jtpo6e92/none_fal7_wk4/attempt_0/1/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + exec(code, run_globals) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 765, in + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + elastic_launch( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 131, in __call__ + main() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint + sd_loader = SDLoaderFactory.get_sd_loader( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader + return MegatronSDLoader(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ + super().__init__(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ + self.check_ckpt_list() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list + assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" + AssertionError: checkpoint count 1 is different from saved mp_world_size 4 + +[2]: + time : 2022-10-06_13:16:44 + host : jean-zay-iam10-ib0 + rank : 10 (local_rank: 2) + exitcode : 1 (pid: 2798330) + error_file: /tmp/torchelastic_jtpo6e92/none_fal7_wk4/attempt_0/2/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + return launch_agent(self._config, self._entrypoint, list(args)) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 245, in launch_agent + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 761, in main + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint + sd_loader = SDLoaderFactory.get_sd_loader( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader + return MegatronSDLoader(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ + super().__init__(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ + self.check_ckpt_list() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list + assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" + AssertionError: checkpoint count 1 is different from saved mp_world_size 4 + +[3]: + time : 2022-10-06_13:16:44 + host : jean-zay-iam10-ib0 + rank : 11 (local_rank: 3) + exitcode : 1 (pid: 2798331) + error_file: /tmp/torchelastic_jtpo6e92/none_fal7_wk4/attempt_0/3/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer + return f(*args, **kwargs) + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint + sd_loader = SDLoaderFactory.get_sd_loader( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 761, in main + return MegatronSDLoader(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ + super().__init__(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ + self.check_ckpt_list() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list + assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" + AssertionError: checkpoint count 1 is different from saved mp_world_size 4 + +[4]: + time : 2022-10-06_13:16:44 + host : jean-zay-iam10-ib0 + rank : 12 (local_rank: 4) + exitcode : 1 (pid: 2798332) + error_file: /tmp/torchelastic_jtpo6e92/none_fal7_wk4/attempt_0/4/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint + sd_loader = SDLoaderFactory.get_sd_loader( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader + return MegatronSDLoader(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ + super().__init__(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ + self.check_ckpt_list() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list + assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" + AssertionError: checkpoint count 1 is different from saved mp_world_size 4 + +[5]: + time : 2022-10-06_13:16:44 + host : jean-zay-iam10-ib0 + rank : 13 (local_rank: 5) + exitcode : 1 (pid: 2798333) + error_file: /tmp/torchelastic_jtpo6e92/none_fal7_wk4/attempt_0/5/error.json + traceback : Traceback (most recent call last): + raise ChildFailedError( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint + sd_loader = SDLoaderFactory.get_sd_loader( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader + return MegatronSDLoader(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ + super().__init__(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ + self.check_ckpt_list() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list + assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" + AssertionError: checkpoint count 1 is different from saved mp_world_size 4 + +[6]: + time : 2022-10-06_13:16:44 + host : jean-zay-iam10-ib0 + rank : 14 (local_rank: 6) + exitcode : 1 (pid: 2798334) + error_file: /tmp/torchelastic_jtpo6e92/none_fal7_wk4/attempt_0/6/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint + sd_loader = SDLoaderFactory.get_sd_loader( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader + return MegatronSDLoader(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ + super().__init__(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ + self.check_ckpt_list() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list + assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" + AssertionError: checkpoint count 1 is different from saved mp_world_size 4 + +[7]: + time : 2022-10-06_13:16:44 + host : jean-zay-iam10-ib0 + rank : 15 (local_rank: 7) + exitcode : 1 (pid: 2798335) + error_file: /tmp/torchelastic_jtpo6e92/none_fal7_wk4/attempt_0/7/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint + sd_loader = SDLoaderFactory.get_sd_loader( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader + return MegatronSDLoader(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ + super().__init__(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ + self.check_ckpt_list() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list + assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" + AssertionError: checkpoint count 1 is different from saved mp_world_size 4 + +------------------------------------------------------------ +Root Cause (first observed failure): +[0]: + time : 2022-10-06_13:16:44 + host : jean-zay-iam10-ib0 + rank : 8 (local_rank: 0) + exitcode : 1 (pid: 2798328) + error_file: /tmp/torchelastic_jtpo6e92/none_fal7_wk4/attempt_0/0/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +torch.distributed.elastic.multiprocessing.errors.ChildFailedError: +============================================================ +/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py FAILED +------------------------------------------------------------ +Failures: +[1]: + time : 2022-10-06_13:16:44 + host : jean-zay-iam11-ib0 + rank : 21 (local_rank: 5) + exitcode : 1 (pid: 3527401) + error_file: /tmp/torchelastic_rczon9pz/none_cw9zv1_d/attempt_0/5/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint + sd_loader = SDLoaderFactory.get_sd_loader( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader + return MegatronSDLoader(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ + super().__init__(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ + self.check_ckpt_list() + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list + assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" + AssertionError: checkpoint count 1 is different from saved mp_world_size 4 + +============================================================ + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint + sd_loader = SDLoaderFactory.get_sd_loader( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader + return MegatronSDLoader(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ + super().__init__(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ + self.check_ckpt_list() + main() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list + assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" + AssertionError: checkpoint count 1 is different from saved mp_world_size 4 + +[2]: + time : 2022-10-06_13:16:44 + host : jean-zay-iam11-ib0 + rank : 23 (local_rank: 7) + exitcode : 1 (pid: 3527403) + error_file: /tmp/torchelastic_rczon9pz/none_cw9zv1_d/attempt_0/7/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint + sd_loader = SDLoaderFactory.get_sd_loader( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader + return MegatronSDLoader(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ + super().__init__(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ + self.check_ckpt_list() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list + assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" + AssertionError: checkpoint count 1 is different from saved mp_world_size 4 + +------------------------------------------------------------ +Root Cause (first observed failure): +[0]: + time : 2022-10-06_13:16:44 + host : jean-zay-iam11-ib0 + rank : 19 (local_rank: 3) + exitcode : 1 (pid: 3527399) + error_file: /tmp/torchelastic_rczon9pz/none_cw9zv1_d/attempt_0/3/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint + sd_loader = SDLoaderFactory.get_sd_loader( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader + return MegatronSDLoader(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ + super().__init__(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ + self.check_ckpt_list() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list + assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" + AssertionError: checkpoint count 1 is different from saved mp_world_size 4 + +============================================================ + run(args) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 752, in run + run(args) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 752, in run + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 761, in main + main() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 761, in main + elastic_launch( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 131, in __call__ + run(args) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 752, in run + elastic_launch( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 131, in __call__ + return launch_agent(self._config, self._entrypoint, list(args)) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 245, in launch_agent + return launch_agent(self._config, self._entrypoint, list(args)) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 245, in launch_agent + raise ChildFailedError( + run(args) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 752, in run +torch.distributed.elastic.multiprocessing.errors.ChildFailedError: +============================================================ +/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py FAILED +------------------------------------------------------------ +Failures: + +------------------------------------------------------------ +Root Cause (first observed failure): +[0]: + time : 2022-10-06_13:16:44 + host : jean-zay-iam27-ib0 + rank : 35 (local_rank: 3) + exitcode : 1 (pid: 3439571) + error_file: /tmp/torchelastic_r0puscx0/none_05tg7cyr/attempt_0/3/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint + sd_loader = SDLoaderFactory.get_sd_loader( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader + return MegatronSDLoader(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ + super().__init__(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ + self.check_ckpt_list() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list + assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" + AssertionError: checkpoint count 1 is different from saved mp_world_size 4 + +============================================================ + raise ChildFailedError( + elastic_launch( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 131, in __call__ + return launch_agent(self._config, self._entrypoint, list(args)) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 245, in launch_agent +torch.distributed.elastic.multiprocessing.errors.ChildFailedError: +============================================================ +/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py FAILED +------------------------------------------------------------ +Failures: +[1]: + time : 2022-10-06_13:16:44 + host : jean-zay-iam05-ib0 + rank : 1 (local_rank: 1) + exitcode : 1 (pid: 217427) + error_file: /tmp/torchelastic_kj5odao4/none_8y05y31o/attempt_0/1/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + raise ChildFailedError( + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint + sd_loader = SDLoaderFactory.get_sd_loader( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader + return MegatronSDLoader(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ + super().__init__(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ + self.check_ckpt_list() +Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 194, in _run_module_as_main + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list + assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" + AssertionError: checkpoint count 1 is different from saved mp_world_size 4 + +[2]: + time : 2022-10-06_13:16:44 + host : jean-zay-iam05-ib0 + rank : 2 (local_rank: 2) + exitcode : 1 (pid: 217428) + error_file: /tmp/torchelastic_kj5odao4/none_8y05y31o/attempt_0/2/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + elastic_launch( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 131, in __call__ +torch.distributed.elastic.multiprocessing.errors.ChildFailedError: +============================================================ +/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py FAILED +------------------------------------------------------------ +Failures: +[1]: + time : 2022-10-06_13:16:44 + host : jean-zay-iam12-ib0 + rank : 27 (local_rank: 3) + exitcode : 1 (pid: 1095171) + error_file: /tmp/torchelastic_vnmq0c49/none_9_pdgldn/attempt_0/3/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint + sd_loader = SDLoaderFactory.get_sd_loader( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader + return MegatronSDLoader(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ + super().__init__(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ + self.check_ckpt_list() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list + assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" + AssertionError: checkpoint count 1 is different from saved mp_world_size 4 + +[3]: + time : 2022-10-06_13:16:44 + host : jean-zay-iam05-ib0 + rank : 3 (local_rank: 3) + exitcode : 1 (pid: 217429) + error_file: /tmp/torchelastic_kj5odao4/none_8y05y31o/attempt_0/3/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint + sd_loader = SDLoaderFactory.get_sd_loader( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader + return MegatronSDLoader(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ + super().__init__(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ + self.check_ckpt_list() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list + assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" + AssertionError: checkpoint count 1 is different from saved mp_world_size 4 + +[4]: + time : 2022-10-06_13:16:44 + host : jean-zay-iam05-ib0 + rank : 4 (local_rank: 4) + exitcode : 1 (pid: 217430) + error_file: /tmp/torchelastic_kj5odao4/none_8y05y31o/attempt_0/4/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint + sd_loader = SDLoaderFactory.get_sd_loader( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader + return MegatronSDLoader(ckpt_list, version, checkpoint_engine) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ + super().__init__(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ + self.check_ckpt_list() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list + assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" + AssertionError: checkpoint count 1 is different from saved mp_world_size 4 + +[5]: + time : 2022-10-06_13:16:44 + host : jean-zay-iam05-ib0 + rank : 5 (local_rank: 5) + exitcode : 1 (pid: 217431) + error_file: /tmp/torchelastic_kj5odao4/none_8y05y31o/attempt_0/5/error.json + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint + sd_loader = SDLoaderFactory.get_sd_loader( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader + return MegatronSDLoader(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ + super().__init__(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ + self.check_ckpt_list() + return _run_code(code, main_globals, None, + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list + assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" + AssertionError: checkpoint count 1 is different from saved mp_world_size 4 + +[2]: + time : 2022-10-06_13:16:44 + host : jean-zay-iam12-ib0 + rank : 29 (local_rank: 5) + exitcode : 1 (pid: 1095173) + error_file: /tmp/torchelastic_vnmq0c49/none_9_pdgldn/attempt_0/5/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint + sd_loader = SDLoaderFactory.get_sd_loader( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader + return MegatronSDLoader(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint + sd_loader = SDLoaderFactory.get_sd_loader( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 87, in _run_code + super().__init__(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ + self.check_ckpt_list() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list + assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" + AssertionError: checkpoint count 1 is different from saved mp_world_size 4 + +[6]: + time : 2022-10-06_13:16:44 + host : jean-zay-iam05-ib0 + rank : 6 (local_rank: 6) + exitcode : 1 (pid: 217432) + error_file: /tmp/torchelastic_kj5odao4/none_8y05y31o/attempt_0/6/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader + return MegatronSDLoader(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ + super().__init__(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ + self.check_ckpt_list() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list + assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" + AssertionError: checkpoint count 1 is different from saved mp_world_size 4 + + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +------------------------------------------------------------ +Root Cause (first observed failure): +[0]: + time : 2022-10-06_13:16:44 + host : jean-zay-iam12-ib0 + rank : 26 (local_rank: 2) + exitcode : 1 (pid: 1095170) + error_file: /tmp/torchelastic_vnmq0c49/none_9_pdgldn/attempt_0/2/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint + sd_loader = SDLoaderFactory.get_sd_loader( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint + sd_loader = SDLoaderFactory.get_sd_loader( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader + return MegatronSDLoader(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ + super().__init__(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ + self.check_ckpt_list() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader + return MegatronSDLoader(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ + super().__init__(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ + self.check_ckpt_list() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list + assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" + AssertionError: checkpoint count 1 is different from saved mp_world_size 4 + + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list + assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" + AssertionError: checkpoint count 1 is different from saved mp_world_size 4 + +[7]: + time : 2022-10-06_13:16:44 + host : jean-zay-iam05-ib0 + rank : 7 (local_rank: 7) + exitcode : 1 (pid: 217433) + error_file: /tmp/torchelastic_kj5odao4/none_8y05y31o/attempt_0/7/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( +============================================================ + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + exec(code, run_globals) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 765, in + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint + sd_loader = SDLoaderFactory.get_sd_loader( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader + return MegatronSDLoader(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ + super().__init__(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ + self.check_ckpt_list() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list + assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" + AssertionError: checkpoint count 1 is different from saved mp_world_size 4 + +------------------------------------------------------------ +Root Cause (first observed failure): +[0]: + time : 2022-10-06_13:16:44 + host : jean-zay-iam05-ib0 + rank : 0 (local_rank: 0) + exitcode : 1 (pid: 217426) + error_file: /tmp/torchelastic_kj5odao4/none_8y05y31o/attempt_0/0/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint + sd_loader = SDLoaderFactory.get_sd_loader( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader + return MegatronSDLoader(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ + super().__init__(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ + self.check_ckpt_list() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list + assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" + AssertionError: checkpoint count 1 is different from saved mp_world_size 4 + +============================================================ + return launch_agent(self._config, self._entrypoint, list(args)) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 245, in launch_agent + raise ChildFailedError( +torch.distributed.elastic.multiprocessing.errors.ChildFailedError: +============================================================ +/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py FAILED +------------------------------------------------------------ +Failures: +[1]: + time : 2022-10-06_13:16:44 + host : jean-zay-iam28-ib0 + rank : 41 (local_rank: 1) + exitcode : 1 (pid: 3915823) + error_file: /tmp/torchelastic_pyumatzt/none_8h5y8c77/attempt_0/1/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint + sd_loader = SDLoaderFactory.get_sd_loader( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader + return MegatronSDLoader(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ + super().__init__(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ + self.check_ckpt_list() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list + assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" + AssertionError: checkpoint count 1 is different from saved mp_world_size 4 + +[2]: + time : 2022-10-06_13:16:44 + host : jean-zay-iam28-ib0 + rank : 42 (local_rank: 2) + exitcode : 1 (pid: 3915824) + error_file: /tmp/torchelastic_pyumatzt/none_8h5y8c77/attempt_0/2/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint + sd_loader = SDLoaderFactory.get_sd_loader( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader + return MegatronSDLoader(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ + super().__init__(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ + self.check_ckpt_list() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list + assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" + AssertionError: checkpoint count 1 is different from saved mp_world_size 4 + +[3]: + time : 2022-10-06_13:16:44 + host : jean-zay-iam28-ib0 + rank : 43 (local_rank: 3) + exitcode : 1 (pid: 3915825) + error_file: /tmp/torchelastic_pyumatzt/none_8h5y8c77/attempt_0/3/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint + sd_loader = SDLoaderFactory.get_sd_loader( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader + return MegatronSDLoader(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ + super().__init__(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ + self.check_ckpt_list() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list + assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" + AssertionError: checkpoint count 1 is different from saved mp_world_size 4 + +[4]: + time : 2022-10-06_13:16:44 + host : jean-zay-iam28-ib0 + rank : 44 (local_rank: 4) + exitcode : 1 (pid: 3915826) + error_file: /tmp/torchelastic_pyumatzt/none_8h5y8c77/attempt_0/4/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint + sd_loader = SDLoaderFactory.get_sd_loader( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader + return MegatronSDLoader(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ + super().__init__(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ + self.check_ckpt_list() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list + assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" + AssertionError: checkpoint count 1 is different from saved mp_world_size 4 + +[5]: + time : 2022-10-06_13:16:44 + host : jean-zay-iam28-ib0 + rank : 45 (local_rank: 5) + exitcode : 1 (pid: 3915827) + error_file: /tmp/torchelastic_pyumatzt/none_8h5y8c77/attempt_0/5/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint + sd_loader = SDLoaderFactory.get_sd_loader( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader + return MegatronSDLoader(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ + super().__init__(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ + self.check_ckpt_list() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list + assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" + AssertionError: checkpoint count 1 is different from saved mp_world_size 4 + +[6]: + time : 2022-10-06_13:16:44 + host : jean-zay-iam28-ib0 + rank : 46 (local_rank: 6) + exitcode : 1 (pid: 3915828) + error_file: /tmp/torchelastic_pyumatzt/none_8h5y8c77/attempt_0/6/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint + sd_loader = SDLoaderFactory.get_sd_loader( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader + return MegatronSDLoader(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ + super().__init__(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ + self.check_ckpt_list() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list + assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" + AssertionError: checkpoint count 1 is different from saved mp_world_size 4 + +[7]: + time : 2022-10-06_13:16:44 + host : jean-zay-iam28-ib0 + rank : 47 (local_rank: 7) + exitcode : 1 (pid: 3915829) + error_file: /tmp/torchelastic_pyumatzt/none_8h5y8c77/attempt_0/7/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint + sd_loader = SDLoaderFactory.get_sd_loader( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader + return MegatronSDLoader(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ + super().__init__(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ + self.check_ckpt_list() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list + assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" + AssertionError: checkpoint count 1 is different from saved mp_world_size 4 + +------------------------------------------------------------ +Root Cause (first observed failure): +[0]: + time : 2022-10-06_13:16:44 + host : jean-zay-iam28-ib0 + rank : 40 (local_rank: 0) + exitcode : 1 (pid: 3915822) + error_file: /tmp/torchelastic_pyumatzt/none_8h5y8c77/attempt_0/0/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint + sd_loader = SDLoaderFactory.get_sd_loader( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader + return MegatronSDLoader(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ + super().__init__(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ + self.check_ckpt_list() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list + assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" + AssertionError: checkpoint count 1 is different from saved mp_world_size 4 + +============================================================ + main() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 761, in main + run(args) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 752, in run + elastic_launch( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 131, in __call__ + return launch_agent(self._config, self._entrypoint, list(args)) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 245, in launch_agent + raise ChildFailedError( +torch.distributed.elastic.multiprocessing.errors.ChildFailedError: +============================================================ +/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py FAILED +------------------------------------------------------------ +Failures: +[1]: + time : 2022-10-06_13:16:44 + host : jean-zay-iam31-ib0 + rank : 57 (local_rank: 1) + exitcode : 1 (pid: 827546) + error_file: /tmp/torchelastic_f40e9npq/none_uaayriol/attempt_0/1/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint + sd_loader = SDLoaderFactory.get_sd_loader( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader + return MegatronSDLoader(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ + super().__init__(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ + self.check_ckpt_list() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list + assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" + AssertionError: checkpoint count 1 is different from saved mp_world_size 4 + +[2]: + time : 2022-10-06_13:16:44 + host : jean-zay-iam31-ib0 + rank : 58 (local_rank: 2) + exitcode : 1 (pid: 827547) + error_file: /tmp/torchelastic_f40e9npq/none_uaayriol/attempt_0/2/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint + sd_loader = SDLoaderFactory.get_sd_loader( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader + return MegatronSDLoader(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ + super().__init__(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ + self.check_ckpt_list() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list + assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" + AssertionError: checkpoint count 1 is different from saved mp_world_size 4 + +[3]: + time : 2022-10-06_13:16:44 + host : jean-zay-iam31-ib0 + rank : 59 (local_rank: 3) + exitcode : 1 (pid: 827548) + error_file: /tmp/torchelastic_f40e9npq/none_uaayriol/attempt_0/3/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint + sd_loader = SDLoaderFactory.get_sd_loader( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader + return MegatronSDLoader(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ + super().__init__(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ + self.check_ckpt_list() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list + assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" + AssertionError: checkpoint count 1 is different from saved mp_world_size 4 + +[4]: + time : 2022-10-06_13:16:44 + host : jean-zay-iam31-ib0 + rank : 60 (local_rank: 4) + exitcode : 1 (pid: 827549) + error_file: /tmp/torchelastic_f40e9npq/none_uaayriol/attempt_0/4/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint + sd_loader = SDLoaderFactory.get_sd_loader( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader + return MegatronSDLoader(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ + super().__init__(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ + self.check_ckpt_list() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list + assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" + AssertionError: checkpoint count 1 is different from saved mp_world_size 4 + +[5]: + time : 2022-10-06_13:16:44 + host : jean-zay-iam31-ib0 + rank : 61 (local_rank: 5) + exitcode : 1 (pid: 827550) + error_file: /tmp/torchelastic_f40e9npq/none_uaayriol/attempt_0/5/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint + sd_loader = SDLoaderFactory.get_sd_loader( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader + return MegatronSDLoader(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ + super().__init__(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ + self.check_ckpt_list() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list + assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" + AssertionError: checkpoint count 1 is different from saved mp_world_size 4 + +[6]: + time : 2022-10-06_13:16:44 + host : jean-zay-iam31-ib0 + rank : 62 (local_rank: 6) + exitcode : 1 (pid: 827551) + error_file: /tmp/torchelastic_f40e9npq/none_uaayriol/attempt_0/6/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint + sd_loader = SDLoaderFactory.get_sd_loader( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader + return MegatronSDLoader(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ + super().__init__(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ + self.check_ckpt_list() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list + assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" + AssertionError: checkpoint count 1 is different from saved mp_world_size 4 + +[7]: + time : 2022-10-06_13:16:44 + host : jean-zay-iam31-ib0 + rank : 63 (local_rank: 7) + exitcode : 1 (pid: 827552) + error_file: /tmp/torchelastic_f40e9npq/none_uaayriol/attempt_0/7/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint + sd_loader = SDLoaderFactory.get_sd_loader( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader + return MegatronSDLoader(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ + super().__init__(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ + self.check_ckpt_list() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list + assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" + AssertionError: checkpoint count 1 is different from saved mp_world_size 4 + +------------------------------------------------------------ +Root Cause (first observed failure): +[0]: + time : 2022-10-06_13:16:44 + host : jean-zay-iam31-ib0 + rank : 56 (local_rank: 0) + exitcode : 1 (pid: 827545) + error_file: /tmp/torchelastic_f40e9npq/none_uaayriol/attempt_0/0/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint + sd_loader = SDLoaderFactory.get_sd_loader( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader + return MegatronSDLoader(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ + super().__init__(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ + self.check_ckpt_list() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 187, in check_ckpt_list + assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" + AssertionError: checkpoint count 1 is different from saved mp_world_size 4 + +============================================================ +srun: error: jean-zay-iam12: task 3: Exited with exit code 1 +srun: launch/slurm: _step_signal: Terminating StepId=2075637.0 +srun: error: jean-zay-iam29: task 6: Exited with exit code 1 +slurmstepd: error: *** STEP 2075637.0 ON jean-zay-iam05 CANCELLED AT 2022-10-06T13:16:51 *** +srun: error: jean-zay-iam27: task 4: Exited with exit code 1 +srun: error: jean-zay-iam11: task 2: Exited with exit code 1 +srun: error: jean-zay-iam28: task 5: Exited with exit code 1 +srun: error: jean-zay-iam05: task 0: Exited with exit code 1 +srun: error: jean-zay-iam31: task 7: Exited with exit code 1 +srun: error: jean-zay-iam10: task 1: Exited with exit code 1 +WARNING:__main__: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +WARNING:__main__: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +WARNING:__main__: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +WARNING:__main__: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +WARNING:__main__: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +WARNING:__main__: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +WARNING:__main__: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +WARNING:__main__: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +[default0]:using world size: 64, data-parallel-size: 16, tensor-model-parallel size: 2, pipeline-model-parallel size: 2 +[default0]:using torch.float16 for parameters ... +[default0]:------------------------ arguments ------------------------ +[default0]: abort_on_unmet_fused_kernel_constraints ......... True +[default0]: accumulate_allreduce_grads_in_fp32 .............. False +[default0]: adam_beta1 ...................................... 0.9 +[default0]: adam_beta2 ...................................... 0.95 +[default0]: adam_eps ........................................ 1e-08 +[default0]: adlr_autoresume ................................. False +[default0]: adlr_autoresume_interval ........................ 1000 +[default0]: apply_query_key_layer_scaling ................... True +[default0]: apply_residual_connection_post_layernorm ........ False +[default0]: attention_dropout ............................... 0.1 +[default0]: attention_softmax_in_fp32 ....................... False +[default0]: bert_binary_head ................................ True +[default0]: bert_load ....................................... None +[default0]: bf16 ............................................ False +[default0]: bias_dropout_fusion ............................. True +[default0]: bias_gelu_fusion ................................ True +[default0]: biencoder_projection_dim ........................ 0 +[default0]: biencoder_shared_query_context_model ............ False +[default0]: block_data_path ................................. None +[default0]: checkpoint_activations .......................... True +[default0]: checkpoint_in_cpu ............................... False +[default0]: checkpoint_num_layers ........................... 1 +[default0]: clip_grad ....................................... 1.0 +[default0]: codecarbon_dir .................................. None +[default0]: consumed_train_samples .......................... 0 +[default0]: consumed_train_tokens ........................... 0 +[default0]: consumed_valid_samples .......................... 0 +[default0]: contigious_checkpointing ........................ False +[default0]: cpu_optimizer ................................... False +[default0]: cpu_torch_adam .................................. False +[default0]: curriculum_learning ............................. False +[default0]: data_impl ....................................... mmap +[default0]: data_parallel_size .............................. 16 +[default0]: data_path ....................................... None +[default0]: dataloader_type ................................. single +[default0]: DDP_impl ........................................ local +[default0]: decoder_seq_length .............................. None +[default0]: deepscale ....................................... False +[default0]: deepscale_config ................................ None +[default0]: deepspeed ....................................... True +[default0]: deepspeed_activation_checkpointing .............. True +[default0]: deepspeed_config ................................ ./ds_config.2075886.json +[default0]: deepspeed_mpi ................................... False +[default0]: distribute_checkpointed_activations ............. False +[default0]: distributed_backend ............................. nccl +[default0]: embed_layernorm ................................. True +[default0]: embedding_path .................................. None +[default0]: encoder_seq_length .............................. 2048 +[default0]: eod_mask_loss ................................... False +[default0]: eval_interval ................................... 125 +[default0]: eval_iters ...................................... 10 +[default0]: eval_only ....................................... None +[default0]: evidence_data_path .............................. None +[default0]: exit_duration_in_mins ........................... 5990 +[default0]: exit_interval ................................... None +[default0]: ffn_hidden_size ................................. 6144 +[default0]: finetune ........................................ False +[default0]: fp16 ............................................ True +[default0]: fp16_lm_cross_entropy ........................... False +[default0]: fp32_residual_connection ........................ False +[default0]: gigaflos_no_embeds .............................. 0 +[default0]: global_batch_size ............................... 2048 +[default0]: glu_activation .................................. None +[default0]: hidden_dropout .................................. 0.1 +[default0]: hidden_size ..................................... 1536 +[default0]: hysteresis ...................................... 2 +[default0]: ict_head_size ................................... None +[default0]: ict_load ........................................ None +[default0]: img_dim ......................................... 224 +[default0]: indexer_batch_size .............................. 128 +[default0]: indexer_log_interval ............................ 1000 +[default0]: inference ....................................... False +[default0]: init_method_std ................................. 0.0048 +[default0]: init_method_xavier_uniform ...................... False +[default0]: initial_loss_scale .............................. 4294967296 +[default0]: kill_switch_path ................................ /gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/kill-switch-tr13b-1B3-mtf +[default0]: kv_channels ..................................... 96 +[default0]: layernorm_epsilon ............................... 1e-05 +[default0]: lazy_mpu_init ................................... None +[default0]: load ............................................ /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq +[default0]: local_rank ...................................... None +[default0]: log_batch_size_to_tensorboard ................... True +[default0]: log_interval .................................... 1 +[default0]: log_learning_rate_to_tensorboard ................ True +[default0]: log_level ....................................... None +[default0]: log_level_replica ............................... None +[default0]: log_loss_scale_to_tensorboard ................... True +[default0]: log_num_zeros_in_grad ........................... False +[default0]: log_params_norm ................................. False +[default0]: log_path ........................................ None +[default0]: log_timers_to_tensorboard ....................... True +[default0]: log_validation_ppl_to_tensorboard ............... True +[default0]: loss_on_targets_only ............................ False +[default0]: loss_scale ...................................... None +[default0]: loss_scale_window ............................... 1000 +[default0]: lr .............................................. 2e-05 +[default0]: lr_decay_iters .................................. None +[default0]: lr_decay_samples ................................ None +[default0]: lr_decay_style .................................. constant +[default0]: lr_decay_tokens ................................. None +[default0]: lr_warmup_fraction .............................. None +[default0]: lr_warmup_iters ................................. 0 +[default0]: lr_warmup_samples ............................... 0 +[default0]: make_vocab_size_divisible_by .................... 128 +[default0]: mask_prob ....................................... 0.15 +[default0]: masked_softmax_fusion ........................... True +[default0]: max_position_embeddings ......................... 2048 +[default0]: mean_noise_span_length .......................... None +[default0]: memory_centric_tiled_linear ..................... False +[default0]: merge_file ...................................... None +[default0]: micro_batch_size ................................ 1 +[default0]: min_loss_scale .................................. 1.0 +[default0]: min_lr .......................................... 0.0 +[default0]: mmap_warmup ..................................... False +[default0]: no_load_optim ................................... True +[default0]: no_load_rng ..................................... None +[default0]: no_save_optim ................................... None +[default0]: no_save_rng ..................................... None +[default0]: noise_density ................................... None +[default0]: norm_target_loss ................................ True +[default0]: num_attention_heads ............................. 16 +[default0]: num_channels .................................... 3 +[default0]: num_classes ..................................... 1000 +[default0]: num_layers ...................................... 24 +[default0]: num_layers_per_virtual_pipeline_stage ........... None +[default0]: num_workers ..................................... 2 +[default0]: onnx_safe ....................................... None +[default0]: openai_gelu ..................................... False +[default0]: optimizer ....................................... adam +[default0]: override_lr_scheduler ........................... False +[default0]: pad_vocab_size_to ............................... 250880 +[default0]: params_dtype .................................... torch.float16 +[default0]: partition_activations ........................... False +[default0]: patch_dim ....................................... 16 +[default0]: pipeline_model_parallel_size .................... 2 +[default0]: position_embedding_type ......................... PositionEmbeddingType.alibi +[default0]: pp_partition_method ............................. type:transformer|embedding +[default0]: prefixlm ........................................ False +[default0]: profile_backward ................................ False +[default0]: query_in_block_prob ............................. 0.1 +[default0]: rampup_batch_size ............................... None +[default0]: rank ............................................ 0 +[default0]: remote_device ................................... none +[default0]: reset_attention_mask ............................ False +[default0]: reset_position_ids .............................. False +[default0]: reset_progress .................................. True +[default0]: retriever_report_topk_accuracies ................ [] +[default0]: retriever_score_scaling ......................... False +[default0]: retriever_seq_length ............................ 256 +[default0]: reweight_loss_based_on_position_frequency ....... False +[default0]: sample_rate ..................................... 1.0 +[default0]: save ............................................ /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq +[default0]: save_interval ................................... 2 +[default0]: scatter_gather_tensors_in_pipeline .............. True +[default0]: scattered_embeddings ............................ False +[default0]: seed ............................................ 42 +[default0]: seq_length ...................................... 2048 +[default0]: sgd_momentum .................................... 0.9 +[default0]: short_seq_prob .................................. 0.1 +[default0]: skip_train_iteration_range ...................... None +[default0]: split ........................................... None +[default0]: split_transformers .............................. False +[default0]: sync_tp_duplicated_parameters ................... False +[default0]: synchronize_each_layer .......................... False +[default0]: tensor_model_parallel_size ...................... 2 +[default0]: tensorboard_dir ................................. /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/tr13b-1B3-ml-logs/tensorboard/xp3capmixnewcodelonglossseq +[default0]: tensorboard_log_interval ........................ 1 +[default0]: tensorboard_queue_size .......................... 5 +[default0]: test_weighted_split_paths ....................... None +[default0]: test_weighted_split_paths_path .................. None +[default0]: tile_factor ..................................... 1 +[default0]: titles_data_path ................................ None +[default0]: tokenizer_name_or_path .......................... bigscience/tokenizer +[default0]: tokenizer_type .................................. PretrainedFromHF +[default0]: train_iters ..................................... None +[default0]: train_samples ................................... 6348800 +[default0]: train_tokens .................................... None +[default0]: train_weighted_split_names ...................... ['train'] +[default0]: train_weighted_split_paths ...................... [['/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_en', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_es', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_pt', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_code', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_fr', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ar', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_id', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_zh', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_hi', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_vi', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ur', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_te', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ta', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_bn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_mr', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_sw', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_gu', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_pa', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ne', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_yo', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ig', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ny', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_zu', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_xh', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_sn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ts', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_rw', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_lg', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_tn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_nso', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_rn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ml', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_kn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_or', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_as', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ln', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_wo', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_tum', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ki', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_st', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_fon', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ca', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_eu', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ak', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_bm', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_tw']] +[default0]: train_weighted_split_paths_path ................. None +[default0]: train_weighted_split_splits ..................... [['0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950']] +[default0]: train_weighted_split_weights .................... [['0.3924620202', '0.0797519865', '0.0645613968', '0.0592991658', '0.0584218969', '0.0492644683', '0.0485168956', '0.048344327', '0.0462777165', '0.0326663657', '0.0202046859', '0.0140392334', '0.0097374884', '0.0087708344', '0.0070173416', '0.0059077793', '0.0059055884', '0.0055112422', '0.0041465344', '0.0037157869', '0.0034255885', '0.0028662571', '0.0027776135', '0.0026823974', '0.0026594781', '0.0026358847', '0.0026264474', '0.0024850557', '0.0024808426', '0.0024194999', '0.0020327371', '0.0018436532', '0.0017427072', '0.0017007448', '0.0016458059', '0.0013303289', '0.0012908943', '0.0012221364', '0.001211688', '0.0012015765', '0.0011909595', '0.0011650068', '0.001138717', '0.0011385485', '0.0011275944', '0.0011195053']] +[default0]: universal_checkpoint ............................ False +[default0]: use_bnb_optimizer ............................... False +[default0]: use_checkpoint_lr_scheduler ..................... False +[default0]: use_contiguous_buffers_in_ddp ................... False +[default0]: use_cpu_initialization .......................... None +[default0]: use_one_sent_docs ............................... False +[default0]: use_pin_memory .................................. False +[default0]: valid_num_workers ............................... 2 +[default0]: valid_weighted_split_names ...................... ['validation_pretraining', 'validation'] +[default0]: valid_weighted_split_paths ...................... [['/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/ar/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_ar_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/ca/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_ca_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/code/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_code_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/en/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_en_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/es/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_es_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/eu/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_eu_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/fr/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_fr_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/id/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_id_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-as/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-as_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-bn/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-bn_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-gu/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-gu_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-hi/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-hi_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-kn/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-kn_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-ml/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-ml_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-mr/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-mr_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-ne/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-ne_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-or/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-or_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-pa/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-pa_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-ta/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-ta_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-te/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-te_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-ur/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-ur_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/nigercongo-all/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_nigercongo-all_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/oscar-en/meg_ds_bigscience_tokenizer_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/oscar-zh/meg_ds_bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/pt/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_pt_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/vi/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_vi_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/zhs/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_zhs_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/zht/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_zht_text_document'], ['/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_en', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_es', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_pt', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_code', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_fr', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ar', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_id', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_zh', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_hi', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_vi', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ur', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_te', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ta', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_bn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_mr', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_sw', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_gu', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_pa', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ne', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_yo', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ig', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ny', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_zu', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_xh', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_sn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ts', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_rw', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_lg', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_tn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_nso', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_rn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ml', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_kn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_or', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_as', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ln', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_wo', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_tum', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ki', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_st', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_fon', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ca', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_eu', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ak', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_bm', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_tw']] +[default0]: valid_weighted_split_paths_path ................. None +[default0]: valid_weighted_split_splits ..................... [['0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0'], ['0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1']] +[default0]: valid_weighted_split_weights .................... [['0.0330676168743166', '0.011242051312222764', '0.13027200903379185', '0.22171164529099704', '0.10667815627928671', '0.0015595123898173287', '0.13054018439603915', '0.01091803753667153', '0.00011021422347108609', '0.005492381453597748', '0.0004021215011318779', '0.007470068593492175', '0.0006190467776576425', '0.0010335296343329384', '0.0005012010684646179', '0.0006672772956128299', '0.00035928138344705506', '0.0005084433130291778', '0.0021137328219915496', '0.0009129946225980253', '0.0012454301613725426', '0.00031588689199263235', '0.08137213783015229', '0.055293935695898196', '0.04954150576361177', '0.02461641286531197', '0.12091748245519074', '0.0005177025345001541'], ['0.3924620202', '0.0797519865', '0.0645613968', '0.0592991658', '0.0584218969', '0.0492644683', '0.0485168956', '0.048344327', '0.0462777165', '0.0326663657', '0.0202046859', '0.0140392334', '0.0097374884', '0.0087708344', '0.0070173416', '0.0059077793', '0.0059055884', '0.0055112422', '0.0041465344', '0.0037157869', '0.0034255885', '0.0028662571', '0.0027776135', '0.0026823974', '0.0026594781', '0.0026358847', '0.0026264474', '0.0024850557', '0.0024808426', '0.0024194999', '0.0020327371', '0.0018436532', '0.0017427072', '0.0017007448', '0.0016458059', '0.0013303289', '0.0012908943', '0.0012221364', '0.001211688', '0.0012015765', '0.0011909595', '0.0011650068', '0.001138717', '0.0011385485', '0.0011275944', '0.0011195053']] +[default0]: virtual_pipeline_model_parallel_size ............ None +[default0]: vocab_extra_ids ................................. 0 +[default0]: vocab_file ...................................... None +[default0]: weight_decay .................................... 0.0001 +[default0]: world_size ...................................... 64 +[default0]: zero_allgather_bucket_size ...................... 0.0 +[default0]: zero_contigious_gradients ....................... False +[default0]: zero_reduce_bucket_size ......................... 0.0 +[default0]: zero_reduce_scatter ............................. False +[default0]: zero_stage ...................................... 1 +[default0]:-------------------- end of arguments --------------------- +[default0]:setting number of micro-batches to constant 128 +[default0]:> building PretrainedFromHF tokenizer ... +[default0]: vocab file is un-used. loading tokenizer from pre-trained model +[default0]:Offline mode: forcing local_files_only=True +[default0]:Offline mode: forcing local_files_only=True +[default0]:loading file tokenizer.json from cache at /gpfswork/rech/six/commun/models/models--bigscience--tokenizer/snapshots/d56c744c331ce0ffa516ed084785eb22da74b91e/tokenizer.json +[default0]:loading file added_tokens.json from cache at None +[default0]:loading file special_tokens_map.json from cache at /gpfswork/rech/six/commun/models/models--bigscience--tokenizer/snapshots/d56c744c331ce0ffa516ed084785eb22da74b91e/special_tokens_map.json +[default0]:loading file tokenizer_config.json from cache at /gpfswork/rech/six/commun/models/models--bigscience--tokenizer/snapshots/d56c744c331ce0ffa516ed084785eb22da74b91e/tokenizer_config.json +[default0]: > padded vocab (size: 250680) with 200 dummy tokens (new size: 250880) +[default0]:DeepSpeed general environment info: +[default0]:torch install path ............... ['/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch'] +[default0]:torch version .................... 1.12.0 +[default0]:torch cuda version ............... 11.3 +[default0]:torch hip version ................ None +[default0]:nvcc version ..................... 11.4 +[default0]:deepspeed install path ........... ['/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed'] +[default0]:deepspeed info ................... 0.7.1+8b2a6371, 8b2a6371, master +[default0]:deepspeed wheel compiled w. ...... torch 1.12, cuda 11.3 +[default0]:**** Git info for Megatron: git_hash=6c1018f git_branch=mtf-multival **** +[default0]:> initializing torch distributed ... +[default0]:[2022-10-06 13:47:30,864] [INFO] [comm.py:628:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl +[default7]:> setting tensorboard ... +[default0]:> initializing tensor model parallel with size 2 +[default0]:> initializing pipeline model parallel with size 2 +[default0]:> setting random seeds to 42 ... +[default0]:[2022-10-06 13:47:36,615] [INFO] [checkpointing.py:226:model_parallel_cuda_manual_seed] > initializing model parallel cuda seeds on global rank 0, model parallel rank 0, and data parallel rank 0 with model parallel seed: 2760 and data parallel seed: 42 +[default0]:> compiling dataset index builder ... +[default0]:make: Entering directory '/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/data' +[default0]:make: Nothing to be done for 'default'. +[default0]:make: Leaving directory '/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/data' +[default0]:>>> done with dataset index builder. Compilation time: 0.055 seconds +[default0]:> compiling and loading fused kernels ... +[default0]:Detected CUDA files, patching ldflags +[default0]:Emitting ninja build file /gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/fused_kernels/build/build.ninja... +[default0]:Building extension module scaled_upper_triang_masked_softmax_cuda... +[default0]:Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) +[default0]:ninja: no work to do. +[default0]:Loading extension module scaled_upper_triang_masked_softmax_cuda... +[default0]:Detected CUDA files, patching ldflags +[default0]:Emitting ninja build file /gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/fused_kernels/build/build.ninja... +[default0]:Building extension module scaled_masked_softmax_cuda... +[default0]:Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) +[default0]:ninja: no work to do. +[default0]:Loading extension module scaled_masked_softmax_cuda... +[default0]:Detected CUDA files, patching ldflags +[default0]:Emitting ninja build file /gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/fused_kernels/build/build.ninja... +[default0]:Building extension module fused_mix_prec_layer_norm_cuda... +[default0]:Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) +[default0]:ninja: no work to do. +[default0]:Loading extension module fused_mix_prec_layer_norm_cuda... +[default0]:>>> done with compiling and loading fused kernels. Compilation time: 6.525 seconds +[default0]:time to initialize megatron (seconds): -32.802 +[default0]:[after megatron is initialized] datetime: 2022-10-06 13:47:43 +[default0]:building GPT model ... +[default0]:[2022-10-06 13:47:43,231] [INFO] [utils.py:827:see_memory_usage] Before Building Model +[default0]:[2022-10-06 13:47:43,231] [INFO] [utils.py:828:see_memory_usage] MA 0.0 GB Max_MA 0.0 GB CA 0.0 GB Max_CA 0 GB +[default0]:[2022-10-06 13:47:43,232] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 33.12 GB, percent = 6.6% +[default0]:SEED_LAYERS=False BASE_SEED=1234 SEED_FN=None +[default0]:Using topology: {ProcessCoord(pipe=0, data=0, model=0): 0, ProcessCoord(pipe=0, data=0, model=1): 1, ProcessCoord(pipe=0, data=1, model=0): 2, ProcessCoord(pipe=0, data=1, model=1): 3, ProcessCoord(pipe=0, data=2, model=0): 4, ProcessCoord(pipe=0, data=2, model=1): 5, ProcessCoord(pipe=0, data=3, model=0): 6, ProcessCoord(pipe=0, data=3, model=1): 7, ProcessCoord(pipe=0, data=4, model=0): 8, ProcessCoord(pipe=0, data=4, model=1): 9, ProcessCoord(pipe=0, data=5, model=0): 10, ProcessCoord(pipe=0, data=5, model=1): 11, ProcessCoord(pipe=0, data=6, model=0): 12, ProcessCoord(pipe=0, data=6, model=1): 13, ProcessCoord(pipe=0, data=7, model=0): 14, ProcessCoord(pipe=0, data=7, model=1): 15, ProcessCoord(pipe=0, data=8, model=0): 16, ProcessCoord(pipe=0, data=8, model=1): 17, ProcessCoord(pipe=0, data=9, model=0): 18, ProcessCoord(pipe=0, data=9, model=1): 19, ProcessCoord(pipe=0, data=10, model=0): 20, ProcessCoord(pipe=0, data=10, model=1): 21, ProcessCoord(pipe=0, data=11, model=0): 22, ProcessCoord(pipe=0, data=11, model=1): 23, ProcessCoord(pipe=0, data=12, model=0): 24, ProcessCoord(pipe=0, data=12, model=1): 25, ProcessCoord(pipe=0, data=13, model=0): 26, ProcessCoord(pipe=0, data=13, model=1): 27, ProcessCoord(pipe=0, data=14, model=0): 28, ProcessCoord(pipe=0, data=14, model=1): 29, ProcessCoord(pipe=0, data=15, model=0): 30, ProcessCoord(pipe=0, data=15, model=1): 31, ProcessCoord(pipe=1, data=0, model=0): 32, ProcessCoord(pipe=1, data=0, model=1): 33, ProcessCoord(pipe=1, data=1, model=0): 34, ProcessCoord(pipe=1, data=1, model=1): 35, ProcessCoord(pipe=1, data=2, model=0): 36, ProcessCoord(pipe=1, data=2, model=1): 37, ProcessCoord(pipe=1, data=3, model=0): 38, ProcessCoord(pipe=1, data=3, model=1): 39, ProcessCoord(pipe=1, data=4, model=0): 40, ProcessCoord(pipe=1, data=4, model=1): 41, ProcessCoord(pipe=1, data=5, model=0): 42, ProcessCoord(pipe=1, data=5, model=1): 43, ProcessCoord(pipe=1, data=6, model=0): 44, ProcessCoord(pipe=1, data=6, model=1): 45, ProcessCoord(pipe=1, data=7, model=0): 46, ProcessCoord(pipe=1, data=7, model=1): 47, ProcessCoord(pipe=1, data=8, model=0): 48, ProcessCoord(pipe=1, data=8, model=1): 49, ProcessCoord(pipe=1, data=9, model=0): 50, ProcessCoord(pipe=1, data=9, model=1): 51, ProcessCoord(pipe=1, data=10, model=0): 52, ProcessCoord(pipe=1, data=10, model=1): 53, ProcessCoord(pipe=1, data=11, model=0): 54, ProcessCoord(pipe=1, data=11, model=1): 55, ProcessCoord(pipe=1, data=12, model=0): 56, ProcessCoord(pipe=1, data=12, model=1): 57, ProcessCoord(pipe=1, data=13, model=0): 58, ProcessCoord(pipe=1, data=13, model=1): 59, ProcessCoord(pipe=1, data=14, model=0): 60, ProcessCoord(pipe=1, data=14, model=1): 61, ProcessCoord(pipe=1, data=15, model=0): 62, ProcessCoord(pipe=1, data=15, model=1): 63} +[default0]:[2022-10-06 13:47:44,085] [INFO] [module.py:366:_partition_layers] Partitioning pipeline stages with method type:transformer|embedding +[default0]:stage=0 layers=15 +[default0]: 0: _to_float16 +[default0]: 1: EmbeddingPipe +[default0]: 2: +[default0]: 3: ParallelTransformerLayerPipe +[default0]: 4: ParallelTransformerLayerPipe +[default0]: 5: ParallelTransformerLayerPipe +[default0]: 6: ParallelTransformerLayerPipe +[default0]: 7: ParallelTransformerLayerPipe +[default0]: 8: ParallelTransformerLayerPipe +[default0]: 9: ParallelTransformerLayerPipe +[default0]: 10: ParallelTransformerLayerPipe +[default0]: 11: ParallelTransformerLayerPipe +[default0]: 12: ParallelTransformerLayerPipe +[default0]: 13: ParallelTransformerLayerPipe +[default0]: 14: ParallelTransformerLayerPipe +[default0]:stage=1 layers=16 +[default0]: 15: ParallelTransformerLayerPipe +[default0]: 16: ParallelTransformerLayerPipe +[default0]: 17: ParallelTransformerLayerPipe +[default0]: 18: ParallelTransformerLayerPipe +[default0]: 19: ParallelTransformerLayerPipe +[default0]: 20: ParallelTransformerLayerPipe +[default0]: 21: ParallelTransformerLayerPipe +[default0]: 22: ParallelTransformerLayerPipe +[default0]: 23: ParallelTransformerLayerPipe +[default0]: 24: ParallelTransformerLayerPipe +[default0]: 25: ParallelTransformerLayerPipe +[default0]: 26: ParallelTransformerLayerPipe +[default0]: 27: undo +[default0]: 28: MixedFusedLayerNorm +[default0]: 29: EmbeddingPipe +[default0]: 30: float16_to_fp32 +[default0]: loss: CrossEntropy +[default0]:[2022-10-06 13:47:44,923] [INFO] [utils.py:827:see_memory_usage] After Building Model +[default0]:[2022-10-06 13:47:44,924] [INFO] [utils.py:828:see_memory_usage] MA 0.68 GB Max_MA 0.68 GB CA 0.71 GB Max_CA 1 GB +[default0]:[2022-10-06 13:47:44,924] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 33.54 GB, percent = 6.7% +[default0]:setting training iterations to 3100 +[default0]:> learning rate decay style: constant +[default0]:DeepSpeed is enabled. +[default0]:[2022-10-06 13:47:44,925] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed info: version=0.7.1+8b2a6371, git-hash=8b2a6371, git-branch=master +[default2]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default1]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default1]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default7]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default5]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default6]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default0]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default4]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default3]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default2]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default2]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default6]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default3]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default0]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default1]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default4]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default0]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default7]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default3]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default5]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default6]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default5]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default7]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default4]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default0]:[2022-10-06 13:47:45,901] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed Flops Profiler Enabled: False +[default0]:[2022-10-06 13:47:45,902] [INFO] [logging.py:68:log_dist] [Rank 0] Removing param_group that has no 'params' in the client Optimizer +[default0]:[2022-10-06 13:47:45,902] [INFO] [logging.py:68:log_dist] [Rank 0] Using client Optimizer as basic optimizer +[default0]:[2022-10-06 13:47:45,904] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed Basic Optimizer = {basic_optimizer.__class__.__name__} +[default0]:[2022-10-06 13:47:45,904] [INFO] [utils.py:52:is_zero_supported_optimizer] Checking ZeRO support for optimizer=FusedAdam type= +[default0]:[2022-10-06 13:47:45,904] [INFO] [logging.py:68:log_dist] [Rank 0] Creating fp16 ZeRO stage 1 optimizer +[default0]:[2022-10-06 13:47:45,904] [INFO] [stage_1_and_2.py:134:__init__] Reduce bucket size 500000000 +[default0]:[2022-10-06 13:47:45,904] [INFO] [stage_1_and_2.py:135:__init__] Allgather bucket size 500000000 +[default0]:[2022-10-06 13:47:45,904] [INFO] [stage_1_and_2.py:136:__init__] CPU Offload: False +[default0]:[2022-10-06 13:47:45,904] [INFO] [stage_1_and_2.py:137:__init__] Round robin gradient partitioning: False +[default0]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default2]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default4]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default1]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default3]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default7]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default6]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default5]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default2]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default3]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default1]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default0]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default5]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default4]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default1]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default6]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default4]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default2]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default0]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default5]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default3]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default7]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default7]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default0]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default1]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default3]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default2]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default5]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default6]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default4]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default6]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default7]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default0]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default3]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default1]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default7]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default2]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default6]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default4]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default5]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default3]:Emitting ninja build file /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113/utils/build.ninja... +[default3]:Building extension module utils... +[default3]:Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) +[default3]:Loading extension module utils... +[default3]:Time to load utils op: 0.3213648796081543 seconds +[default2]:Loading extension module utils... +[default2]:Time to load utils op: 0.3302881717681885 seconds +[default1]:Loading extension module utils... +[default1]:Time to load utils op: 0.32122302055358887 seconds +[default4]:Loading extension module utils... +[default4]:Time to load utils op: 0.3302953243255615 seconds +[default5]:Loading extension module utils... +[default0]:Loading extension module utils... +[default5]:Time to load utils op: 0.3212592601776123 seconds +[default0]:Time to load utils op: 0.33029866218566895 seconds +[default1]:Loading extension module utils... +[default1]:Time to load utils op: 0.31959056854248047 seconds +[default6]:Loading extension module utils... +[default6]:Time to load utils op: 0.32829809188842773 seconds +[default0]:Loading extension module utils... +[default4]:Loading extension module utils... +[default4]:Time to load utils op: 0.328289270401001 seconds +[default0]:Time to load utils op: 0.3282806873321533 seconds +[default2]:Loading extension module utils... +[default2]:Time to load utils op: 0.3282895088195801 seconds +[default5]:Loading extension module utils... +[default5]:Time to load utils op: 0.3192310333251953 seconds +[default3]:Loading extension module utils... +[default3]:Time to load utils op: 0.3192412853240967 seconds +[default7]:Loading extension module utils... +[default7]:Time to load utils op: 0.31937718391418457 seconds +[default7]:Loading extension module utils... +[default7]:Time to load utils op: 0.29586005210876465 seconds +[default0]:Loading extension module utils... +[default0]:Time to load utils op: 0.30910396575927734 seconds +[default1]:Loading extension module utils... +[default1]:Time to load utils op: 0.295928955078125 seconds +[default3]:Loading extension module utils... +[default3]:Time to load utils op: 0.2957913875579834 seconds +[default2]:Loading extension module utils... +[default2]:Time to load utils op: 0.3080461025238037 seconds +[default5]:Loading extension module utils... +[default5]:Time to load utils op: 0.29593443870544434 seconds +[default6]:Loading extension module utils... +[default6]:Time to load utils op: 0.3092765808105469 seconds +[default4]:Loading extension module utils... +[default4]:Time to load utils op: 0.3095710277557373 seconds +[default6]:Loading extension module utils... +[default6]:Time to load utils op: 0.3303201198577881 seconds +[default2]:Loading extension module utils... +[default2]:Time to load utils op: 0.416109561920166 seconds +[default7]:Loading extension module utils... +[default7]:Time to load utils op: 0.321868896484375 seconds +[default1]:Loading extension module utils... +[default1]:Time to load utils op: 0.4161057472229004 seconds +[default7]:Loading extension module utils... +[default7]:Time to load utils op: 0.4161250591278076 seconds +[default5]:Loading extension module utils... +[default5]:Time to load utils op: 0.4161243438720703 seconds +[default0]:Loading extension module utils... +[default0]:Time to load utils op: 0.416095495223999 seconds +[default6]:Loading extension module utils... +[default6]:Time to load utils op: 0.41611814498901367 seconds +[default3]:Loading extension module utils... +[default3]:Time to load utils op: 0.41609621047973633 seconds +[default4]:Loading extension module utils... +[default4]:Time to load utils op: 0.4161109924316406 seconds +[default2]:Loading extension module utils... +[default2]:Time to load utils op: 0.4228990077972412 seconds +[default3]:ninja: no work to do. +[default3]:Loading extension module utils... +[default3]:Time to load utils op: 0.3928864002227783 seconds +[default6]:Loading extension module utils... +[default6]:Time to load utils op: 0.4229881763458252 seconds +[default1]:Loading extension module utils... +[default1]:Time to load utils op: 0.420971155166626 seconds +[default4]:Loading extension module utils... +[default4]:Time to load utils op: 0.4229905605316162 seconds +[default0]:Loading extension module utils... +[default0]:Time to load utils op: 0.42299747467041016 seconds +[default7]:Loading extension module utils... +[default7]:Time to load utils op: 0.4197382926940918 seconds +[default3]:Loading extension module utils... +[default3]:Time to load utils op: 0.42136144638061523 seconds +[default5]:Loading extension module utils... +[default5]:Time to load utils op: 0.419222354888916 seconds +[default0]:Loading extension module utils... +[default0]:Time to load utils op: 0.4258239269256592 seconds +[default2]:Loading extension module utils... +[default4]:Loading extension module utils... +[default4]:Time to load utils op: 0.424724817276001 seconds +[default2]:Time to load utils op: 0.4235687255859375 seconds +[default3]:Loading extension module utils... +[default1]:Loading extension module utils... +[default1]:Time to load utils op: 0.4272146224975586 seconds +[default3]:Time to load utils op: 0.4154033660888672 seconds +[default7]:Loading extension module utils... +[default7]:Time to load utils op: 0.4246993064880371 seconds +[default6]:Loading extension module utils... +[default6]:Time to load utils op: 0.4244656562805176 seconds +[default5]:Loading extension module utils... +[default5]:Time to load utils op: 0.4238455295562744 seconds +[default0]:Loading extension module utils... +[default0]:Time to load utils op: 0.33373355865478516 seconds +[default3]:Loading extension module utils... +[default3]:Time to load utils op: 0.3211557865142822 seconds +[default7]:Loading extension module utils... +[default7]:Time to load utils op: 0.32440996170043945 seconds +[default1]:Loading extension module utils... +[default1]:Time to load utils op: 0.32534241676330566 seconds +[default2]:Loading extension module utils... +[default2]:Time to load utils op: 0.3336954116821289 seconds +[default6]:Loading extension module utils... +[default6]:Time to load utils op: 0.3336973190307617 seconds +[default4]:Loading extension module utils... +[default5]:Loading extension module utils... +[default5]:Time to load utils op: 0.32329511642456055 seconds +[default4]:Time to load utils op: 0.3336963653564453 seconds +[default1]:Loading extension module utils... +[default1]:Time to load utils op: 0.4874558448791504 seconds +[default2]:Loading extension module utils... +[default2]:Time to load utils op: 0.4861571788787842 seconds +[default0]:Loading extension module utils... +[default0]:Time to load utils op: 0.4874849319458008 seconds +[default6]:Loading extension module utils... +[default6]:Time to load utils op: 0.48793506622314453 seconds +[default5]:Loading extension module utils... +[default5]:Time to load utils op: 0.48444342613220215 seconds +[default4]:Loading extension module utils... +[default4]:Time to load utils op: 0.48760032653808594 seconds +[default7]:Loading extension module utils... +[default7]:Time to load utils op: 0.48722362518310547 seconds +[default6]:Rank: 46 partition count [16, 16, 16] and sizes[(17055744, False), (5603328, False), (11136, False)] +[default7]:Rank: 47 partition count [16, 16, 16] and sizes[(17055744, False), (5603328, False), (11136, False)] +[default2]:Rank: 42 partition count [16, 16, 16] and sizes[(17055744, False), (5603328, False), (11136, False)] +[default3]:Rank: 43 partition count [16, 16, 16] and sizes[(17055744, False), (5603328, False), (11136, False)] +[default2]:Rank: 10 partition count [16, 16, 16] and sizes[(17055744, False), (5603328, False), (11136, False)] +[default2]:Rank: 2 partition count [16, 16, 16] and sizes[(17055744, False), (5603328, False), (11136, False)] +[default3]:Rank: 3 partition count [16, 16, 16] and sizes[(17055744, False), (5603328, False), (11136, False)] +[default0]:Rank: 48 partition count [16, 16, 16] and sizes[(17055744, False), (5603328, False), (11136, False)] +[default0]:Rank: 8 partition count [16, 16, 16] and sizes[(17055744, False), (5603328, False), (11136, False)] +[default3]:Rank: 11 partition count [16, 16, 16] and sizes[(17055744, False), (5603328, False), (11136, False)] +[default0]:Rank: 16 partition count [16, 16, 16] and sizes[(17055744, False), (5603328, False), (11136, False)] +[default1]:Rank: 33 partition count [16, 16, 16] and sizes[(17055744, False), (5603328, False), (11136, False)] +[default1]:Rank: 49 partition count [16, 16, 16] and sizes[(17055744, False), (5603328, False), (11136, False)] +[default2]:Rank: 50 partition count [16, 16, 16] and sizes[(17055744, False), (5603328, False), (11136, False)] +[default1]:Rank: 9 partition count [16, 16, 16] and sizes[(17055744, False), (5603328, False), (11136, False)] +[default1]:Rank: 17 partition count [16, 16, 16] and sizes[(17055744, False), (5603328, False), (11136, False)] +[default6]:Rank: 14 partition count [16, 16, 16] and sizes[(17055744, False), (5603328, False), (11136, False)] +[default7]:Rank: 15 partition count [16, 16, 16] and sizes[(17055744, False), (5603328, False), (11136, False)] +[default4]:Rank: 4 partition count [16, 16, 16] and sizes[(17055744, False), (5603328, False), (11136, False)] +[default1]:Rank: 1 partition count [16, 16, 16] and sizes[(17055744, False), (5603328, False), (11136, False)] +[default7]:Rank: 7 partition count [16, 16, 16] and sizes[(17055744, False), (5603328, False), (11136, False)] +[default6]:Rank: 6 partition count [16, 16, 16] and sizes[(17055744, False), (5603328, False), (11136, False)] +[default5]:Rank: 5 partition count [16, 16, 16] and sizes[(17055744, False), (5603328, False), (11136, False)] +[default0]:Rank: 32 partition count [16, 16, 16] and sizes[(17055744, False), (5603328, False), (11136, False)] +[default7]:Rank: 39 partition count [16, 16, 16] and sizes[(17055744, False), (5603328, False), (11136, False)] +[default2]:Rank: 34 partition count [16, 16, 16] and sizes[(17055744, False), (5603328, False), (11136, False)] +[default3]:Rank: 51 partition count [16, 16, 16] and sizes[(17055744, False), (5603328, False), (11136, False)] +[default6]:Rank: 54 partition count [16, 16, 16] and sizes[(17055744, False), (5603328, False), (11136, False)] +[default1]:Rank: 25 partition count [16, 16, 16] and sizes[(17055744, False), (5603328, False), (11136, False)] +[default7]:Rank: 31 partition count [16, 16, 16] and sizes[(17055744, False), (5603328, False), (11136, False)] +[default6]:Rank: 30 partition count [16, 16, 16] and sizes[(17055744, False), (5603328, False), (11136, False)] +[default3]:Rank: 27 partition count [16, 16, 16] and sizes[(17055744, False), (5603328, False), (11136, False)] +[default0]:Rank: 24 partition count [16, 16, 16] and sizes[(17055744, False), (5603328, False), (11136, False)] +[default2]:Rank: 18 partition count [16, 16, 16] and sizes[(17055744, False), (5603328, False), (11136, False)] +[default6]:Rank: 22 partition count [16, 16, 16] and sizes[(17055744, False), (5603328, False), (11136, False)] +[default7]:Rank: 23 partition count [16, 16, 16] and sizes[(17055744, False), (5603328, False), (11136, False)] +[default3]:Rank: 19 partition count [16, 16, 16] and sizes[(17055744, False), (5603328, False), (11136, False)] +[default5]:Rank: 13 partition count [16, 16, 16] and sizes[(17055744, False), (5603328, False), (11136, False)] +[default4]:Rank: 12 partition count [16, 16, 16] and sizes[(17055744, False), (5603328, False), (11136, False)] +[default0]:Rank: 0 partition count [16, 16, 16] and sizes[(17055744, False), (5603328, False), (11136, False)] +[default3]:Rank: 35 partition count [16, 16, 16] and sizes[(17055744, False), (5603328, False), (11136, False)] +[default6]:Rank: 38 partition count [16, 16, 16] and sizes[(17055744, False), (5603328, False), (11136, False)] +[default4]:Rank: 36 partition count [16, 16, 16] and sizes[(17055744, False), (5603328, False), (11136, False)] +[default5]:Rank: 37 partition count [16, 16, 16] and sizes[(17055744, False), (5603328, False), (11136, False)] +[default7]:Rank: 55 partition count [16, 16, 16] and sizes[(17055744, False), (5603328, False), (11136, False)] +[default5]:Rank: 53 partition count [16, 16, 16] and sizes[(17055744, False), (5603328, False), (11136, False)] +[default4]:Rank: 52 partition count [16, 16, 16] and sizes[(17055744, False), (5603328, False), (11136, False)] +[default2]:Rank: 26 partition count [16, 16, 16] and sizes[(17055744, False), (5603328, False), (11136, False)] +[default5]:Rank: 29 partition count [16, 16, 16] and sizes[(17055744, False), (5603328, False), (11136, False)] +[default4]:Rank: 28 partition count [16, 16, 16] and sizes[(17055744, False), (5603328, False), (11136, False)] +[default4]:Rank: 20 partition count [16, 16, 16] and sizes[(17055744, False), (5603328, False), (11136, False)] +[default5]:Rank: 21 partition count [16, 16, 16] and sizes[(17055744, False), (5603328, False), (11136, False)] +[default4]:Rank: 60 partition count [16, 16, 16] and sizes[(17055744, False), (5603328, False), (11136, False)] +[default5]:Rank: 61 partition count [16, 16, 16] and sizes[(17055744, False), (5603328, False), (11136, False)] +[default4]:Rank: 44 partition count [16, 16, 16] and sizes[(17055744, False), (5603328, False), (11136, False)] +[default5]:Rank: 45 partition count [16, 16, 16] and sizes[(17055744, False), (5603328, False), (11136, False)] +[default0]:Rank: 40 partition count [16, 16, 16] and sizes[(17055744, False), (5603328, False), (11136, False)] +[default1]:Rank: 41 partition count [16, 16, 16] and sizes[(17055744, False), (5603328, False), (11136, False)] +[default1]:Rank: 57 partition count [16, 16, 16] and sizes[(17055744, False), (5603328, False), (11136, False)] +[default2]:Rank: 58 partition count [16, 16, 16] and sizes[(17055744, False), (5603328, False), (11136, False)] +[default3]:Rank: 59 partition count [16, 16, 16] and sizes[(17055744, False), (5603328, False), (11136, False)] +[default2]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default2]:No modifications detected for re-loaded extension module utils, skipping build step... +[default2]:Loading extension module utils... +[default2]:Time to load utils op: 0.0016598701477050781 seconds +[default3]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default3]:No modifications detected for re-loaded extension module utils, skipping build step... +[default3]:Loading extension module utils... +[default3]:Time to load utils op: 0.0017843246459960938 seconds +[default4]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default4]:No modifications detected for re-loaded extension module utils, skipping build step... +[default4]:Loading extension module utils... +[default4]:Time to load utils op: 0.002524852752685547 seconds +[default0]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default0]:No modifications detected for re-loaded extension module utils, skipping build step... +[default0]:Loading extension module utils... +[default0]:Time to load utils op: 0.001806020736694336 seconds +[default1]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default1]:No modifications detected for re-loaded extension module utils, skipping build step... +[default1]:Loading extension module utils... +[default1]:Time to load utils op: 0.0016002655029296875 seconds +[default5]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default5]:No modifications detected for re-loaded extension module utils, skipping build step... +[default5]:Loading extension module utils... +[default5]:Time to load utils op: 0.0020189285278320312 seconds +[default1]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default1]:No modifications detected for re-loaded extension module utils, skipping build step... +[default1]:Loading extension module utils... +[default1]:Time to load utils op: 0.0021517276763916016 seconds +[default4]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default6]:Rank: 62 partition count [16, 16, 16] and sizes[(17055744, False), (5603328, False), (11136, False)] +[default6]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default6]:No modifications detected for re-loaded extension module utils, skipping build step... +[default4]:No modifications detected for re-loaded extension module utils, skipping build step... +[default4]:Loading extension module utils... +[default4]:Time to load utils op: 0.002347230911254883 seconds +[default6]:Loading extension module utils... +[default6]:Time to load utils op: 0.0026063919067382812 seconds +[default0]:Rank: 56 partition count [16, 16, 16] and sizes[(17055744, False), (5603328, False), (11136, False)] +[default0]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default0]:No modifications detected for re-loaded extension module utils, skipping build step... +[default0]:Loading extension module utils... +[default0]:Time to load utils op: 0.0019223690032958984 seconds +[default2]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default2]:No modifications detected for re-loaded extension module utils, skipping build step... +[default2]:Loading extension module utils... +[default2]:Time to load utils op: 0.0020411014556884766 seconds +[default5]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default5]:No modifications detected for re-loaded extension module utils, skipping build step... +[default5]:Loading extension module utils... +[default5]:Time to load utils op: 0.001837015151977539 seconds +[default3]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default3]:No modifications detected for re-loaded extension module utils, skipping build step... +[default3]:Loading extension module utils... +[default3]:Time to load utils op: 0.0018820762634277344 seconds +[default7]:Rank: 63 partition count [16, 16, 16] and sizes[(17055744, False), (5603328, False), (11136, False)] +[default7]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default7]:No modifications detected for re-loaded extension module utils, skipping build step... +[default7]:Loading extension module utils... +[default7]:Time to load utils op: 0.0020754337310791016 seconds +[default7]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default7]:No modifications detected for re-loaded extension module utils, skipping build step... +[default7]:Loading extension module utils... +[default7]:Time to load utils op: 0.0020303726196289062 seconds +[default0]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default0]:No modifications detected for re-loaded extension module utils, skipping build step... +[default0]:Loading extension module utils... +[default0]:Time to load utils op: 0.001990079879760742 seconds +[default1]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default1]:No modifications detected for re-loaded extension module utils, skipping build step... +[default3]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default3]:No modifications detected for re-loaded extension module utils, skipping build step... +[default3]:Loading extension module utils... +[default1]:Loading extension module utils... +[default1]:Time to load utils op: 0.0016980171203613281 seconds +[default3]:Time to load utils op: 0.001962900161743164 seconds +[default5]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default5]:No modifications detected for re-loaded extension module utils, skipping build step... +[default5]:Loading extension module utils... +[default5]:Time to load utils op: 0.0030965805053710938 seconds +[default6]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default6]:No modifications detected for re-loaded extension module utils, skipping build step... +[default6]:Loading extension module utils... +[default2]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default2]:No modifications detected for re-loaded extension module utils, skipping build step... +[default2]:Loading extension module utils... +[default6]:Time to load utils op: 0.0023772716522216797 seconds +[default2]:Time to load utils op: 0.0020444393157958984 seconds +[default2]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default2]:No modifications detected for re-loaded extension module utils, skipping build step... +[default2]:Loading extension module utils... +[default2]:Time to load utils op: 0.0017251968383789062 seconds +[default7]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default7]:No modifications detected for re-loaded extension module utils, skipping build step... +[default7]:Loading extension module utils... +[default7]:Time to load utils op: 0.0024733543395996094 seconds +[default4]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default4]:No modifications detected for re-loaded extension module utils, skipping build step... +[default4]:Loading extension module utils... +[default4]:Time to load utils op: 0.0025262832641601562 seconds +[default6]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default6]:No modifications detected for re-loaded extension module utils, skipping build step... +[default6]:Loading extension module utils... +[default6]:Time to load utils op: 0.002026081085205078 seconds +[default1]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default1]:No modifications detected for re-loaded extension module utils, skipping build step... +[default1]:Loading extension module utils... +[default1]:Time to load utils op: 0.0016248226165771484 seconds +[default1]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default1]:No modifications detected for re-loaded extension module utils, skipping build step... +[default1]:Loading extension module utils... +[default1]:Time to load utils op: 0.0018911361694335938 seconds +[default7]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default7]:No modifications detected for re-loaded extension module utils, skipping build step... +[default7]:Loading extension module utils... +[default7]:Time to load utils op: 0.0021543502807617188 seconds +[default5]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default5]:No modifications detected for re-loaded extension module utils, skipping build step... +[default5]:Loading extension module utils... +[default5]:Time to load utils op: 0.0025398731231689453 seconds +[default6]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default6]:No modifications detected for re-loaded extension module utils, skipping build step... +[default6]:Loading extension module utils... +[default6]:Time to load utils op: 0.0026531219482421875 seconds +[default4]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default3]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default3]:No modifications detected for re-loaded extension module utils, skipping build step... +[default3]:Loading extension module utils... +[default4]:No modifications detected for re-loaded extension module utils, skipping build step... +[default4]:Loading extension module utils... +[default3]:Time to load utils op: 0.001957416534423828 seconds +[default4]:Time to load utils op: 0.0017390251159667969 seconds +[default0]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default0]:No modifications detected for re-loaded extension module utils, skipping build step... +[default0]:Loading extension module utils... +[default0]:Time to load utils op: 0.0018579959869384766 seconds +[default2]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default2]:No modifications detected for re-loaded extension module utils, skipping build step... +[default2]:Loading extension module utils... +[default2]:Time to load utils op: 0.0018432140350341797 seconds +[default0]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default0]:No modifications detected for re-loaded extension module utils, skipping build step... +[default0]:Loading extension module utils... +[default0]:Time to load utils op: 0.0018668174743652344 seconds +[default3]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default3]:No modifications detected for re-loaded extension module utils, skipping build step... +[default3]:Loading extension module utils... +[default3]:Time to load utils op: 0.0015912055969238281 seconds +[default2]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default2]:No modifications detected for re-loaded extension module utils, skipping build step... +[default2]:Loading extension module utils... +[default2]:Time to load utils op: 0.001491546630859375 seconds +[default6]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default6]:No modifications detected for re-loaded extension module utils, skipping build step... +[default6]:Loading extension module utils... +[default6]:Time to load utils op: 0.0016105175018310547 seconds +[default1]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default1]:No modifications detected for re-loaded extension module utils, skipping build step... +[default1]:Loading extension module utils... +[default1]:Time to load utils op: 0.0014519691467285156 seconds +[default6]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default6]:No modifications detected for re-loaded extension module utils, skipping build step... +[default6]:Loading extension module utils... +[default6]:Time to load utils op: 0.0028150081634521484 seconds +[default4]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default0]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default0]:No modifications detected for re-loaded extension module utils, skipping build step... +[default0]:Loading extension module utils... +[default0]:Time to load utils op: 0.0014200210571289062 seconds +[default4]:No modifications detected for re-loaded extension module utils, skipping build step... +[default4]:Loading extension module utils... +[default4]:Time to load utils op: 0.001959085464477539 seconds +[default5]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default5]:No modifications detected for re-loaded extension module utils, skipping build step... +[default5]:Loading extension module utils... +[default5]:Time to load utils op: 0.0020940303802490234 seconds +[default4]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default4]:No modifications detected for re-loaded extension module utils, skipping build step... +[default4]:Loading extension module utils... +[default4]:Time to load utils op: 0.0022084712982177734 seconds +[default3]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default3]:No modifications detected for re-loaded extension module utils, skipping build step... +[default3]:Loading extension module utils... +[default3]:Time to load utils op: 0.0015246868133544922 seconds +[default7]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default7]:No modifications detected for re-loaded extension module utils, skipping build step... +[default5]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default5]:No modifications detected for re-loaded extension module utils, skipping build step... +[default7]:Loading extension module utils... +[default7]:Time to load utils op: 0.0012738704681396484 seconds +[default5]:Loading extension module utils... +[default5]:Time to load utils op: 0.0028853416442871094 seconds +[default7]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default7]:No modifications detected for re-loaded extension module utils, skipping build step... +[default7]:Loading extension module utils... +[default7]:Time to load utils op: 0.0026421546936035156 seconds +[default0]:[2022-10-06 13:47:47,955] [INFO] [utils.py:827:see_memory_usage] Before initializing optimizer states +[default0]:[2022-10-06 13:47:47,956] [INFO] [utils.py:828:see_memory_usage] MA 0.76 GB Max_MA 0.77 GB CA 1.22 GB Max_CA 1 GB +[default0]:[2022-10-06 13:47:47,956] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 35.74 GB, percent = 7.1% +[default4]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default4]:No modifications detected for re-loaded extension module utils, skipping build step... +[default4]:Loading extension module utils... +[default4]:Time to load utils op: 0.003001689910888672 seconds +[default2]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default2]:No modifications detected for re-loaded extension module utils, skipping build step... +[default2]:Loading extension module utils... +[default2]:Time to load utils op: 0.0025260448455810547 seconds +[default3]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default1]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default1]:No modifications detected for re-loaded extension module utils, skipping build step... +[default1]:Loading extension module utils... +[default1]:Time to load utils op: 0.0026807785034179688 seconds +[default3]:No modifications detected for re-loaded extension module utils, skipping build step... +[default3]:Loading extension module utils... +[default3]:Time to load utils op: 0.0026035308837890625 seconds +[default5]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default5]:No modifications detected for re-loaded extension module utils, skipping build step... +[default5]:Loading extension module utils... +[default5]:Time to load utils op: 0.003300905227661133 seconds +[default7]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default7]:No modifications detected for re-loaded extension module utils, skipping build step... +[default7]:Loading extension module utils... +[default7]:Time to load utils op: 0.0028891563415527344 seconds +[default6]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default6]:No modifications detected for re-loaded extension module utils, skipping build step... +[default6]:Loading extension module utils... +[default6]:Time to load utils op: 0.0029387474060058594 seconds +[default0]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default0]:No modifications detected for re-loaded extension module utils, skipping build step... +[default0]:Loading extension module utils... +[default0]:Time to load utils op: 0.0010151863098144531 seconds +[default3]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default3]:No modifications detected for re-loaded extension module utils, skipping build step... +[default3]:Loading extension module utils... +[default3]:Time to load utils op: 0.0008375644683837891 seconds +[default7]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default7]:No modifications detected for re-loaded extension module utils, skipping build step... +[default1]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default1]:No modifications detected for re-loaded extension module utils, skipping build step... +[default1]:Loading extension module utils... +[default7]:Loading extension module utils... +[default7]:Time to load utils op: 0.0008685588836669922 seconds +[default1]:Time to load utils op: 0.0009818077087402344 seconds +[default2]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default2]:No modifications detected for re-loaded extension module utils, skipping build step... +[default2]:Loading extension module utils... +[default2]:Time to load utils op: 0.0009849071502685547 seconds +[default6]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default6]:No modifications detected for re-loaded extension module utils, skipping build step... +[default6]:Loading extension module utils... +[default6]:Time to load utils op: 0.0011086463928222656 seconds +[default4]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default4]:No modifications detected for re-loaded extension module utils, skipping build step... +[default4]:Loading extension module utils... +[default4]:Time to load utils op: 0.0015478134155273438 seconds +[default5]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default5]:No modifications detected for re-loaded extension module utils, skipping build step... +[default5]:Loading extension module utils... +[default5]:Time to load utils op: 0.001623392105102539 seconds +[default0]:[2022-10-06 13:47:48,019] [INFO] [utils.py:827:see_memory_usage] After initializing optimizer states +[default0]:[2022-10-06 13:47:48,019] [INFO] [utils.py:828:see_memory_usage] MA 0.93 GB Max_MA 1.02 GB CA 1.39 GB Max_CA 1 GB +[default0]:[2022-10-06 13:47:48,019] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 35.74 GB, percent = 7.1% +[default0]:[2022-10-06 13:47:48,019] [INFO] [stage_1_and_2.py:516:__init__] optimizer state initialized +[default0]:[2022-10-06 13:47:48,048] [INFO] [utils.py:827:see_memory_usage] After initializing ZeRO optimizer +[default0]:[2022-10-06 13:47:48,048] [INFO] [utils.py:828:see_memory_usage] MA 0.93 GB Max_MA 0.93 GB CA 1.39 GB Max_CA 1 GB +[default0]:[2022-10-06 13:47:48,048] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 35.74 GB, percent = 7.1% +[default0]:[2022-10-06 13:47:48,048] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed Final Optimizer = FusedAdam +[default0]:[2022-10-06 13:47:48,048] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed using client LR scheduler +[default0]:[2022-10-06 13:47:48,048] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed LR Scheduler = +[default0]:[2022-10-06 13:47:48,048] [INFO] [logging.py:68:log_dist] [Rank 0] step=0, skipped=0, lr=[2e-05, 2e-05, 2e-05], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)] +[default0]:[2022-10-06 13:47:48,049] [INFO] [config.py:987:print] DeepSpeedEngine configuration: +[default0]:[2022-10-06 13:47:48,049] [INFO] [config.py:991:print] activation_checkpointing_config { +[default0]: "partition_activations": false, +[default0]: "contiguous_memory_optimization": false, +[default0]: "cpu_checkpointing": false, +[default0]: "number_checkpoints": null, +[default0]: "synchronize_checkpoint_boundary": false, +[default0]: "profile": false +[default0]:} +[default0]:[2022-10-06 13:47:48,049] [INFO] [config.py:991:print] aio_config ................... {'block_size': 1048576, 'queue_depth': 8, 'thread_count': 1, 'single_submit': False, 'overlap_events': True} +[default0]:[2022-10-06 13:47:48,049] [INFO] [config.py:991:print] amp_enabled .................. False +[default0]:[2022-10-06 13:47:48,049] [INFO] [config.py:991:print] amp_params ................... False +[default0]:[2022-10-06 13:47:48,049] [INFO] [config.py:991:print] autotuning_config ............ { +[default0]: "enabled": false, +[default0]: "start_step": null, +[default0]: "end_step": null, +[default0]: "metric_path": null, +[default0]: "arg_mappings": null, +[default0]: "metric": "throughput", +[default0]: "model_info": null, +[default0]: "results_dir": null, +[default0]: "exps_dir": null, +[default0]: "overwrite": true, +[default0]: "fast": true, +[default0]: "start_profile_step": 3, +[default0]: "end_profile_step": 5, +[default0]: "tuner_type": "gridsearch", +[default0]: "tuner_early_stopping": 5, +[default0]: "tuner_num_trials": 50, +[default0]: "model_info_path": null, +[default0]: "mp_size": 1, +[default0]: "max_train_batch_size": null, +[default0]: "min_train_batch_size": 1, +[default0]: "max_train_micro_batch_size_per_gpu": 1.024000e+03, +[default0]: "min_train_micro_batch_size_per_gpu": 1, +[default0]: "num_tuning_micro_batch_sizes": 3 +[default0]:} +[default0]:[2022-10-06 13:47:48,049] [INFO] [config.py:991:print] bfloat16_enabled ............. False +[default0]:[2022-10-06 13:47:48,049] [INFO] [config.py:991:print] checkpoint_tag_validation_enabled True +[default0]:[2022-10-06 13:47:48,049] [INFO] [config.py:991:print] checkpoint_tag_validation_fail False +[default0]:[2022-10-06 13:47:48,049] [INFO] [config.py:991:print] comms_config ................. +[default0]:[2022-10-06 13:47:48,049] [INFO] [config.py:991:print] communication_data_type ...... None +[default0]:[2022-10-06 13:47:48,049] [INFO] [config.py:991:print] compression_config ........... {'weight_quantization': {'shared_parameters': {'enabled': False, 'quantizer_kernel': False, 'schedule_offset': 0, 'quantize_groups': 1, 'quantize_verbose': False, 'quantization_type': 'symmetric', 'quantize_weight_in_forward': False, 'rounding': 'nearest', 'fp16_mixed_quantize': False, 'quantize_change_ratio': 0.001}, 'different_groups': {}}, 'activation_quantization': {'shared_parameters': {'enabled': False, 'quantization_type': 'symmetric', 'range_calibration': 'dynamic', 'schedule_offset': 1000}, 'different_groups': {}}, 'sparse_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'row_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'head_pruning': {'shared_parameters': {'enabled': False, 'method': 'topk', 'schedule_offset': 1000}, 'different_groups': {}}, 'channel_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'layer_reduction': {'enabled': False}} +[default0]:[2022-10-06 13:47:48,049] [INFO] [config.py:991:print] curriculum_enabled ........... False +[default0]:[2022-10-06 13:47:48,049] [INFO] [config.py:991:print] curriculum_params ............ False +[default0]:[2022-10-06 13:47:48,049] [INFO] [config.py:991:print] dataloader_drop_last ......... False +[default0]:[2022-10-06 13:47:48,049] [INFO] [config.py:991:print] disable_allgather ............ False +[default0]:[2022-10-06 13:47:48,049] [INFO] [config.py:991:print] dump_state ................... False +[default0]:[2022-10-06 13:47:48,049] [INFO] [config.py:991:print] dynamic_loss_scale_args ...... {'init_scale': 4096, 'scale_window': 500, 'delayed_shift': 2, 'min_scale': 1} +[default0]:[2022-10-06 13:47:48,049] [INFO] [config.py:991:print] eigenvalue_enabled ........... False +[default0]:[2022-10-06 13:47:48,049] [INFO] [config.py:991:print] eigenvalue_gas_boundary_resolution 1 +[default0]:[2022-10-06 13:47:48,049] [INFO] [config.py:991:print] eigenvalue_layer_name ........ bert.encoder.layer +[default0]:[2022-10-06 13:47:48,050] [INFO] [config.py:991:print] eigenvalue_layer_num ......... 0 +[default0]:[2022-10-06 13:47:48,050] [INFO] [config.py:991:print] eigenvalue_max_iter .......... 100 +[default0]:[2022-10-06 13:47:48,050] [INFO] [config.py:991:print] eigenvalue_stability ......... 1e-06 +[default0]:[2022-10-06 13:47:48,050] [INFO] [config.py:991:print] eigenvalue_tol ............... 0.01 +[default0]:[2022-10-06 13:47:48,050] [INFO] [config.py:991:print] eigenvalue_verbose ........... False +[default0]:[2022-10-06 13:47:48,050] [INFO] [config.py:991:print] elasticity_enabled ........... False +[default0]:[2022-10-06 13:47:48,050] [INFO] [config.py:991:print] flops_profiler_config ........ { +[default0]: "enabled": false, +[default0]: "profile_step": 1, +[default0]: "module_depth": -1, +[default0]: "top_modules": 1, +[default0]: "detailed": true, +[default0]: "output_file": null +[default0]:} +[default0]:[2022-10-06 13:47:48,050] [INFO] [config.py:991:print] fp16_auto_cast ............... False +[default0]:[2022-10-06 13:47:48,050] [INFO] [config.py:991:print] fp16_enabled ................. True +[default0]:[2022-10-06 13:47:48,050] [INFO] [config.py:991:print] fp16_master_weights_and_gradients False +[default0]:[2022-10-06 13:47:48,050] [INFO] [config.py:991:print] global_rank .................. 0 +[default0]:[2022-10-06 13:47:48,050] [INFO] [config.py:991:print] gradient_accumulation_steps .. 128 +[default0]:[2022-10-06 13:47:48,050] [INFO] [config.py:991:print] gradient_clipping ............ 1.0 +[default0]:[2022-10-06 13:47:48,050] [INFO] [config.py:991:print] gradient_predivide_factor .... 1.0 +[default0]:[2022-10-06 13:47:48,050] [INFO] [config.py:991:print] initial_dynamic_scale ........ 4096 +[default0]:[2022-10-06 13:47:48,050] [INFO] [config.py:991:print] load_universal_checkpoint .... False +[default0]:[2022-10-06 13:47:48,050] [INFO] [config.py:991:print] loss_scale ................... 0 +[default0]:[2022-10-06 13:47:48,050] [INFO] [config.py:991:print] memory_breakdown ............. False +[default0]:[2022-10-06 13:47:48,050] [INFO] [config.py:991:print] monitor_config ............... +[default0]:[2022-10-06 13:47:48,050] [INFO] [config.py:991:print] nebula_config ................ { +[default0]: "enabled": false, +[default0]: "persistent_storage_path": null, +[default0]: "persistent_time_interval": 100, +[default0]: "num_of_version_in_retention": 2, +[default0]: "enable_nebula_load": true, +[default0]: "load_path": null +[default0]:} +[default0]:[2022-10-06 13:47:48,050] [INFO] [config.py:991:print] optimizer_legacy_fusion ...... False +[default0]:[2022-10-06 13:47:48,050] [INFO] [config.py:991:print] optimizer_name ............... None +[default0]:[2022-10-06 13:47:48,050] [INFO] [config.py:991:print] optimizer_params ............. None +[default0]:[2022-10-06 13:47:48,050] [INFO] [config.py:991:print] pipeline ..................... {'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0} +[default0]:[2022-10-06 13:47:48,050] [INFO] [config.py:991:print] pld_enabled .................. False +[default0]:[2022-10-06 13:47:48,050] [INFO] [config.py:991:print] pld_params ................... False +[default0]:[2022-10-06 13:47:48,050] [INFO] [config.py:991:print] prescale_gradients ........... False +[default0]:[2022-10-06 13:47:48,050] [INFO] [config.py:991:print] scheduler_name ............... None +[default0]:[2022-10-06 13:47:48,050] [INFO] [config.py:991:print] scheduler_params ............. None +[default0]:[2022-10-06 13:47:48,050] [INFO] [config.py:991:print] sparse_attention ............. None +[default0]:[2022-10-06 13:47:48,050] [INFO] [config.py:991:print] sparse_gradients_enabled ..... False +[default0]:[2022-10-06 13:47:48,050] [INFO] [config.py:991:print] steps_per_print .............. 2000 +[default0]:[2022-10-06 13:47:48,050] [INFO] [config.py:991:print] train_batch_size ............. 2048 +[default0]:[2022-10-06 13:47:48,050] [INFO] [config.py:991:print] train_micro_batch_size_per_gpu 1 +[default0]:[2022-10-06 13:47:48,050] [INFO] [config.py:991:print] wall_clock_breakdown ......... False +[default0]:[2022-10-06 13:47:48,050] [INFO] [config.py:991:print] world_size ................... 16 +[default0]:[2022-10-06 13:47:48,050] [INFO] [config.py:991:print] zero_allow_untested_optimizer False +[default0]:[2022-10-06 13:47:48,050] [INFO] [config.py:991:print] zero_config .................. stage=1 contiguous_gradients=True reduce_scatter=True reduce_bucket_size=500000000 allgather_partitions=True allgather_bucket_size=500000000 overlap_comm=False load_from_fp32_weights=True elastic_checkpoint=False offload_param=None offload_optimizer=None sub_group_size=1000000000 cpu_offload_param=None cpu_offload_use_pin_memory=None cpu_offload=None prefetch_bucket_size=50000000 param_persistence_threshold=100000 model_persistence_threshold=9223372036854775807 max_live_parameters=1000000000 max_reuse_distance=1000000000 gather_16bit_weights_on_model_save=False stage3_gather_fp16_weights_on_model_save=False ignore_unused_parameters=True legacy_stage1=False round_robin_gradients=False +[default0]:[2022-10-06 13:47:48,050] [INFO] [config.py:991:print] zero_enabled ................. True +[default0]:[2022-10-06 13:47:48,050] [INFO] [config.py:991:print] zero_optimization_stage ...... 1 +[default0]:[2022-10-06 13:47:48,050] [INFO] [config.py:976:print_user_config] json = { +[default0]: "train_micro_batch_size_per_gpu": 1, +[default0]: "train_batch_size": 2.048000e+03, +[default0]: "gradient_clipping": 1.0, +[default0]: "zero_optimization": { +[default0]: "stage": 1 +[default0]: }, +[default0]: "fp16": { +[default0]: "enabled": true, +[default0]: "loss_scale": 0, +[default0]: "loss_scale_window": 500, +[default0]: "hysteresis": 2, +[default0]: "min_loss_scale": 1, +[default0]: "initial_scale_power": 12 +[default0]: }, +[default0]: "steps_per_print": 2.000000e+03, +[default0]: "wall_clock_breakdown": false +[default0]:} +[default0]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default0]:No modifications detected for re-loaded extension module utils, skipping build step... +[default0]:Loading extension module utils... +[default0]:Time to load utils op: 0.00045108795166015625 seconds +[default0]:[2022-10-06 13:47:48,051] [INFO] [engine.py:87:__init__] CONFIG: micro_batches=128 micro_batch_size=1 +[default0]:[2022-10-06 13:47:48,534] [INFO] [engine.py:145:__init__] RANK=0 STAGE=0 LAYERS=15 [0, 15) STAGE_PARAMS=362723328 (362.723M) TOTAL_PARAMS=1450893312 (1450.893M) UNIQUE_PARAMS=1065541632 (1065.542M) +[default1]:[2022-10-06 13:47:48,534] [INFO] [engine.py:145:__init__] RANK=1 STAGE=0 LAYERS=15 [0, 15) STAGE_PARAMS=362723328 (362.723M) TOTAL_PARAMS=1450893312 (1450.893M) UNIQUE_PARAMS=1065541632 (1065.542M) +[default0]:[2022-10-06 13:47:48,536] [INFO] [engine.py:145:__init__] RANK=32 STAGE=1 LAYERS=16 [15, 31) STAGE_PARAMS=362723328 (362.723M) TOTAL_PARAMS=1450893312 (1450.893M) UNIQUE_PARAMS=1065541632 (1065.542M) +[default1]:[2022-10-06 13:47:48,534] [INFO] [engine.py:145:__init__] RANK=33 STAGE=1 LAYERS=16 [15, 31) STAGE_PARAMS=362723328 (362.723M) TOTAL_PARAMS=1450893312 (1450.893M) UNIQUE_PARAMS=1065541632 (1065.542M) +[default2]:[2022-10-06 13:47:48,711] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default3]:[2022-10-06 13:47:48,711] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default4]:[2022-10-06 13:47:48,711] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default1]:[2022-10-06 13:47:48,711] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default0]:[2022-10-06 13:47:48,711] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default5]:[2022-10-06 13:47:48,711] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default1]:[2022-10-06 13:47:48,707] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default6]:[2022-10-06 13:47:48,707] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default4]:[2022-10-06 13:47:48,707] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default0]:[2022-10-06 13:47:48,707] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default2]:[2022-10-06 13:47:48,707] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default5]:[2022-10-06 13:47:48,707] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default3]:[2022-10-06 13:47:48,707] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default7]:[2022-10-06 13:47:48,707] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default7]:[2022-10-06 13:47:48,707] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default0]:[2022-10-06 13:47:48,707] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default3]:[2022-10-06 13:47:48,707] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default1]:[2022-10-06 13:47:48,707] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default6]:[2022-10-06 13:47:48,707] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default2]:[2022-10-06 13:47:48,707] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default5]:[2022-10-06 13:47:48,707] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default7]:[2022-10-06 13:47:48,711] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default6]:[2022-10-06 13:47:48,711] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default4]:[2022-10-06 13:47:48,707] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default2]:[2022-10-06 13:47:48,710] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default1]:[2022-10-06 13:47:48,710] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default1]:[2022-10-06 13:47:48,707] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default7]:[2022-10-06 13:47:48,710] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default5]:[2022-10-06 13:47:48,710] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default4]:[2022-10-06 13:47:48,710] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default3]:[2022-10-06 13:47:48,710] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default0]:[2022-10-06 13:47:48,710] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default6]:[2022-10-06 13:47:48,710] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default2]:[2022-10-06 13:47:48,707] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default0]:[2022-10-06 13:47:48,707] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default3]:[2022-10-06 13:47:48,707] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default2]:[2022-10-06 13:47:48,707] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default6]:[2022-10-06 13:47:48,707] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default6]:[2022-10-06 13:47:48,707] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default1]:[2022-10-06 13:47:48,707] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default5]:[2022-10-06 13:47:48,707] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default4]:[2022-10-06 13:47:48,707] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default0]:[2022-10-06 13:47:48,707] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default4]:[2022-10-06 13:47:48,707] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default3]:[2022-10-06 13:47:48,707] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default7]:[2022-10-06 13:47:48,707] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default5]:[2022-10-06 13:47:48,707] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default7]:[2022-10-06 13:47:48,707] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default0]:[2022-10-06 13:47:48,707] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default4]:[2022-10-06 13:47:48,707] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default2]:[2022-10-06 13:47:48,707] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default3]:[2022-10-06 13:47:48,707] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default1]:[2022-10-06 13:47:48,707] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default7]:[2022-10-06 13:47:48,707] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default5]:[2022-10-06 13:47:48,707] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default6]:[2022-10-06 13:47:48,707] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default3]:[2022-10-06 13:47:48,710] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default0]:[2022-10-06 13:47:48,711] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default7]:[2022-10-06 13:47:48,710] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default1]:[2022-10-06 13:47:48,711] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default2]:[2022-10-06 13:47:48,710] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default6]:[2022-10-06 13:47:48,711] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default5]:[2022-10-06 13:47:48,711] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default4]:[2022-10-06 13:47:48,710] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default6]:[2022-10-06 13:47:50,363] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default6]:[2022-10-06 13:47:50,375] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_02_model_states.pt... +[default5]:[2022-10-06 13:47:50,363] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default5]:[2022-10-06 13:47:50,375] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_03_model_states.pt... +[default1]:[2022-10-06 13:47:50,374] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default1]:[2022-10-06 13:47:50,385] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_01_model_states.pt... +[default3]:[2022-10-06 13:47:50,374] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default3]:[2022-10-06 13:47:50,385] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_01_model_states.pt... +[default2]:[2022-10-06 13:47:50,466] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default3]:[2022-10-06 13:47:50,394] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default3]:[2022-10-06 13:47:50,405] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_01_model_states.pt... +[default0]:[2022-10-06 13:47:50,394] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default0]:[2022-10-06 13:47:50,404] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default5]:[2022-10-06 13:47:50,528] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default4]:[2022-10-06 13:47:50,493] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default4]:[2022-10-06 13:47:50,503] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_02_model_states.pt... +[default6]:[2022-10-06 13:47:50,491] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default6]:[2022-10-06 13:47:50,502] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_02_model_states.pt... +[default2]:[2022-10-06 13:47:50,464] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default2]:[2022-10-06 13:47:50,474] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default1]:[2022-10-06 13:47:50,496] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default1]:[2022-10-06 13:47:50,505] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_01_model_states.pt... +[default3]:[2022-10-06 13:47:50,510] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default3]:[2022-10-06 13:47:50,524] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_01_model_states.pt... +[default0]:[2022-10-06 13:47:50,462] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default0]:[2022-10-06 13:47:50,473] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default2]:[2022-10-06 13:47:50,501] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default2]:[2022-10-06 13:47:50,510] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default0]:[2022-10-06 13:47:50,518] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default0]:[2022-10-06 13:47:50,535] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default2]:[2022-10-06 13:47:50,475] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default1]:[2022-10-06 13:47:50,564] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default7]:[2022-10-06 13:47:50,505] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default7]:[2022-10-06 13:47:50,514] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_03_model_states.pt... +[default5]:[2022-10-06 13:47:50,546] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_03_model_states.pt... +[default0]:[2022-10-06 13:47:50,576] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default0]:[2022-10-06 13:47:50,593] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_02_model_states.pt... +[default1]:[2022-10-06 13:47:50,579] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default1]:[2022-10-06 13:47:50,593] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_03_model_states.pt... +[default2]:[2022-10-06 13:47:50,582] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default2]:[2022-10-06 13:47:50,598] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_02_model_states.pt... +[default5]:[2022-10-06 13:47:50,576] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default5]:[2022-10-06 13:47:50,593] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_03_model_states.pt... +[default3]:[2022-10-06 13:47:50,577] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default3]:[2022-10-06 13:47:50,592] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_03_model_states.pt... +[default7]:[2022-10-06 13:47:50,643] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default6]:[2022-10-06 13:47:50,615] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default6]:[2022-10-06 13:47:50,623] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default1]:[2022-10-06 13:47:50,579] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_01_model_states.pt... +[default4]:[2022-10-06 13:47:50,628] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default4]:[2022-10-06 13:47:50,647] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_02_model_states.pt... +[default7]:[2022-10-06 13:47:50,653] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_03_model_states.pt... +[default2]:[2022-10-06 13:47:50,747] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default2]:[2022-10-06 13:47:50,755] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_02_model_states.pt... +[default4]:[2022-10-06 13:47:51,101] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default4]:[2022-10-06 13:47:51,112] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default5]:[2022-10-06 13:47:51,129] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default5]:[2022-10-06 13:47:51,138] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_01_model_states.pt... +[default7]:[2022-10-06 13:47:51,101] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default7]:[2022-10-06 13:47:51,112] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_01_model_states.pt... +[default6]:[2022-10-06 13:47:51,195] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default6]:[2022-10-06 13:47:51,203] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default2]:[2022-10-06 13:47:51,393] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default2]:[2022-10-06 13:47:51,406] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_02_model_states.pt... +[default0]:[2022-10-06 13:47:51,369] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default0]:[2022-10-06 13:47:51,381] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_02_model_states.pt... +[default1]:[2022-10-06 13:47:51,372] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default1]:[2022-10-06 13:47:51,386] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_03_model_states.pt... +[default3]:[2022-10-06 13:47:51,400] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default3]:[2022-10-06 13:47:51,413] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_03_model_states.pt... +[default6]:[2022-10-06 13:47:51,435] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default6]:[2022-10-06 13:47:51,450] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default7]:[2022-10-06 13:47:51,435] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default7]:[2022-10-06 13:47:51,453] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_01_model_states.pt... +[default4]:[2022-10-06 13:47:51,435] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default4]:[2022-10-06 13:47:51,452] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default5]:[2022-10-06 13:47:51,435] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default5]:[2022-10-06 13:47:51,450] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_01_model_states.pt... +[default5]:[2022-10-06 13:47:51,472] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default7]:[2022-10-06 13:47:51,472] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default0]:[2022-10-06 13:47:51,434] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default0]:[2022-10-06 13:47:51,445] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_02_model_states.pt... +[default6]:[2022-10-06 13:47:51,468] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default6]:[2022-10-06 13:47:51,480] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_02_model_states.pt... +[default4]:[2022-10-06 13:47:51,469] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default4]:[2022-10-06 13:47:51,481] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_02_model_states.pt... +[default3]:[2022-10-06 13:47:51,471] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default3]:[2022-10-06 13:47:51,487] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_03_model_states.pt... +[default5]:[2022-10-06 13:47:51,484] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default5]:[2022-10-06 13:47:51,499] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_03_model_states.pt... +[default6]:[2022-10-06 13:47:51,485] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default6]:[2022-10-06 13:47:51,500] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_02_model_states.pt... +[default2]:[2022-10-06 13:47:51,472] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default2]:[2022-10-06 13:47:51,489] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_02_model_states.pt... +[default1]:[2022-10-06 13:47:51,473] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default1]:[2022-10-06 13:47:51,489] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_03_model_states.pt... +[default7]:[2022-10-06 13:47:51,480] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default7]:[2022-10-06 13:47:51,495] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_03_model_states.pt... +[default0]:[2022-10-06 13:47:51,470] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default0]:[2022-10-06 13:47:51,486] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_02_model_states.pt... +[default7]:[2022-10-06 13:47:51,496] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default7]:[2022-10-06 13:47:51,505] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_03_model_states.pt... +[default4]:[2022-10-06 13:47:51,479] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default4]:[2022-10-06 13:47:51,495] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_02_model_states.pt... +[default5]:[2022-10-06 13:47:51,512] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default5]:[2022-10-06 13:47:51,527] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_01_model_states.pt... +[default4]:[2022-10-06 13:47:51,511] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default4]:[2022-10-06 13:47:51,527] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default7]:[2022-10-06 13:47:51,515] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default7]:[2022-10-06 13:47:51,530] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_01_model_states.pt... +[default0]:[2022-10-06 13:47:51,474] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default0]:[2022-10-06 13:47:51,492] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default4]:[2022-10-06 13:47:51,474] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default4]:[2022-10-06 13:47:51,493] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default2]:[2022-10-06 13:47:51,473] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default2]:[2022-10-06 13:47:51,494] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default1]:[2022-10-06 13:47:51,485] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default1]:[2022-10-06 13:47:51,501] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_01_model_states.pt... +[default3]:[2022-10-06 13:47:51,482] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default3]:[2022-10-06 13:47:51,497] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_01_model_states.pt... +[default5]:[2022-10-06 13:47:51,488] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_01_model_states.pt... +[default7]:[2022-10-06 13:47:51,488] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_01_model_states.pt... +[default6]:[2022-10-06 13:47:51,474] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default6]:[2022-10-06 13:47:51,494] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default3]:[2022-10-06 13:47:51,481] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default3]:[2022-10-06 13:47:51,492] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_03_model_states.pt... +[default1]:[2022-10-06 13:47:51,976] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default1]:[2022-10-06 13:47:51,985] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_03_model_states.pt... +[default7]:[2022-10-06 13:47:52,556] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_03_model_states.pt. +[default7]:[2022-10-06 13:47:52,557] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default5]:[2022-10-06 13:47:52,555] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_03_model_states.pt. +[default5]:[2022-10-06 13:47:52,556] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default7]:[2022-10-06 13:47:52,618] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default7]:[2022-10-06 13:47:52,618] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_01-model_states.pt... +[default7]:[2022-10-06 13:47:52,672] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_01-model_states.pt. +[default5]:[2022-10-06 13:47:52,617] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default5]:[2022-10-06 13:47:52,618] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_01-model_states.pt... +[default5]:[2022-10-06 13:47:52,671] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_01-model_states.pt. +[default7]:Traceback (most recent call last): +[default7]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default7]: main() +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default7]: return f(*args, **kwargs) +[default7]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default7]: pretrain( +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default7]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer +[default7]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default7]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default7]: load_path, client_states = self._load_checkpoint(load_dir, +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2640, in _load_checkpoint +[default7]: self.load_module_state_dict(state_dict=checkpoint['module'], +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1340, in load_module_state_dict +[default7]: self.module.load_state_dir(load_dir=self._curr_ckpt_path, +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/module.py", line 604, in load_state_dir +[default7]: layer.load_state_dict(checkpoint) +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1604, in load_state_dict +[default7]: raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format( +[default7]:RuntimeError: Error(s) in loading state_dict for ParallelTransformerLayerPipe: +[default7]: size mismatch for input_layernorm.weight: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default7]: size mismatch for input_layernorm.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default7]: size mismatch for self_attention.query_key_value.weight: copying a param with shape torch.Size([3072, 2048]) from checkpoint, the shape in current model is torch.Size([2304, 1536]). +[default7]: size mismatch for self_attention.query_key_value.bias: copying a param with shape torch.Size([3072]) from checkpoint, the shape in current model is torch.Size([2304]). +[default7]: size mismatch for self_attention.dense.weight: copying a param with shape torch.Size([2048, 1024]) from checkpoint, the shape in current model is torch.Size([1536, 768]). +[default7]: size mismatch for self_attention.dense.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default7]: size mismatch for post_attention_layernorm.weight: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default7]: size mismatch for post_attention_layernorm.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default7]: size mismatch for mlp.dense_h_to_4h.weight: copying a param with shape torch.Size([4096, 2048]) from checkpoint, the shape in current model is torch.Size([3072, 1536]). +[default7]: size mismatch for mlp.dense_h_to_4h.bias: copying a param with shape torch.Size([4096]) from checkpoint, the shape in current model is torch.Size([3072]). +[default7]: size mismatch for mlp.dense_4h_to_h.weight: copying a param with shape torch.Size([2048, 4096]) from checkpoint, the shape in current model is torch.Size([1536, 3072]). +[default7]: size mismatch for mlp.dense_4h_to_h.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default5]:Traceback (most recent call last): +[default5]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default5]: main() +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default5]: return f(*args, **kwargs) +[default5]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default5]: pretrain( +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default5]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer +[default5]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default5]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default5]: load_path, client_states = self._load_checkpoint(load_dir, +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2640, in _load_checkpoint +[default5]: self.load_module_state_dict(state_dict=checkpoint['module'], +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1340, in load_module_state_dict +[default5]: self.module.load_state_dir(load_dir=self._curr_ckpt_path, +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/module.py", line 604, in load_state_dir +[default5]: layer.load_state_dict(checkpoint) +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1604, in load_state_dict +[default5]: raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format( +[default5]:RuntimeError: Error(s) in loading state_dict for ParallelTransformerLayerPipe: +[default5]: size mismatch for input_layernorm.weight: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default5]: size mismatch for input_layernorm.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default5]: size mismatch for self_attention.query_key_value.weight: copying a param with shape torch.Size([3072, 2048]) from checkpoint, the shape in current model is torch.Size([2304, 1536]). +[default5]: size mismatch for self_attention.query_key_value.bias: copying a param with shape torch.Size([3072]) from checkpoint, the shape in current model is torch.Size([2304]). +[default5]: size mismatch for self_attention.dense.weight: copying a param with shape torch.Size([2048, 1024]) from checkpoint, the shape in current model is torch.Size([1536, 768]). +[default5]: size mismatch for self_attention.dense.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default5]: size mismatch for post_attention_layernorm.weight: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default5]: size mismatch for post_attention_layernorm.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default5]: size mismatch for mlp.dense_h_to_4h.weight: copying a param with shape torch.Size([4096, 2048]) from checkpoint, the shape in current model is torch.Size([3072, 1536]). +[default5]: size mismatch for mlp.dense_h_to_4h.bias: copying a param with shape torch.Size([4096]) from checkpoint, the shape in current model is torch.Size([3072]). +[default5]: size mismatch for mlp.dense_4h_to_h.weight: copying a param with shape torch.Size([2048, 4096]) from checkpoint, the shape in current model is torch.Size([1536, 3072]). +[default5]: size mismatch for mlp.dense_4h_to_h.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default1]:[2022-10-06 13:47:52,717] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_01_model_states.pt. +[default1]:[2022-10-06 13:47:52,718] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default3]:[2022-10-06 13:47:52,717] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_01_model_states.pt. +[default3]:[2022-10-06 13:47:52,718] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default6]:[2022-10-06 13:47:52,846] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_02_model_states.pt. +[default6]:[2022-10-06 13:47:52,846] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default6]:[2022-10-06 13:47:52,879] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default6]:[2022-10-06 13:47:52,879] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default6]:Traceback (most recent call last): +[default6]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default6]: main() +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default6]: return f(*args, **kwargs) +[default6]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default6]: pretrain( +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default6]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer +[default6]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default6]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default6]: load_path, client_states = self._load_checkpoint(load_dir, +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2640, in _load_checkpoint +[default6]: self.load_module_state_dict(state_dict=checkpoint['module'], +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1340, in load_module_state_dict +[default6]: self.module.load_state_dir(load_dir=self._curr_ckpt_path, +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/module.py", line 604, in load_state_dir +[default6]: layer.load_state_dict(checkpoint) +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1604, in load_state_dict +[default6]: raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format( +[default6]:RuntimeError: Error(s) in loading state_dict for ParallelTransformerLayerPipe: +[default6]: size mismatch for input_layernorm.weight: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default6]: size mismatch for input_layernorm.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default6]: size mismatch for self_attention.query_key_value.weight: copying a param with shape torch.Size([3072, 2048]) from checkpoint, the shape in current model is torch.Size([2304, 1536]). +[default6]: size mismatch for self_attention.query_key_value.bias: copying a param with shape torch.Size([3072]) from checkpoint, the shape in current model is torch.Size([2304]). +[default6]: size mismatch for self_attention.dense.weight: copying a param with shape torch.Size([2048, 1024]) from checkpoint, the shape in current model is torch.Size([1536, 768]). +[default6]: size mismatch for self_attention.dense.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default6]: size mismatch for post_attention_layernorm.weight: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default6]: size mismatch for post_attention_layernorm.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default6]: size mismatch for mlp.dense_h_to_4h.weight: copying a param with shape torch.Size([4096, 2048]) from checkpoint, the shape in current model is torch.Size([3072, 1536]). +[default6]: size mismatch for mlp.dense_h_to_4h.bias: copying a param with shape torch.Size([4096]) from checkpoint, the shape in current model is torch.Size([3072]). +[default6]: size mismatch for mlp.dense_4h_to_h.weight: copying a param with shape torch.Size([2048, 4096]) from checkpoint, the shape in current model is torch.Size([1536, 3072]). +[default6]: size mismatch for mlp.dense_4h_to_h.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default1]:[2022-10-06 13:47:52,932] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default1]:[2022-10-06 13:47:52,934] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_01-model_states.pt... +[default3]:[2022-10-06 13:47:52,932] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default3]:[2022-10-06 13:47:52,934] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_01-model_states.pt... +[default2]:[2022-10-06 13:47:52,968] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default2]:[2022-10-06 13:47:52,969] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default0]:[2022-10-06 13:47:52,917] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default0]:[2022-10-06 13:47:52,917] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default3]:[2022-10-06 13:47:52,932] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_01_model_states.pt. +[default3]:[2022-10-06 13:47:52,933] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default1]:[2022-10-06 13:47:52,945] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_01_model_states.pt. +[default1]:[2022-10-06 13:47:52,945] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default2]:[2022-10-06 13:47:52,925] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_02_model_states.pt. +[default2]:[2022-10-06 13:47:52,926] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default2]:[2022-10-06 13:47:52,951] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default2]:[2022-10-06 13:47:52,951] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default2]:[2022-10-06 13:47:52,977] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default2]:Traceback (most recent call last): +[default2]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default2]: main() +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default2]: return f(*args, **kwargs) +[default2]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default2]: pretrain( +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default2]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer +[default2]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default2]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default2]: load_path, client_states = self._load_checkpoint(load_dir, +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2640, in _load_checkpoint +[default2]: self.load_module_state_dict(state_dict=checkpoint['module'], +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1340, in load_module_state_dict +[default2]: self.module.load_state_dir(load_dir=self._curr_ckpt_path, +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/module.py", line 604, in load_state_dir +[default2]: layer.load_state_dict(checkpoint) +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1604, in load_state_dict +[default2]: raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format( +[default2]:RuntimeError: Error(s) in loading state_dict for ParallelTransformerLayerPipe: +[default2]: size mismatch for input_layernorm.weight: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default2]: size mismatch for input_layernorm.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default2]: size mismatch for self_attention.query_key_value.weight: copying a param with shape torch.Size([3072, 2048]) from checkpoint, the shape in current model is torch.Size([2304, 1536]). +[default2]: size mismatch for self_attention.query_key_value.bias: copying a param with shape torch.Size([3072]) from checkpoint, the shape in current model is torch.Size([2304]). +[default2]: size mismatch for self_attention.dense.weight: copying a param with shape torch.Size([2048, 1024]) from checkpoint, the shape in current model is torch.Size([1536, 768]). +[default2]: size mismatch for self_attention.dense.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default2]: size mismatch for post_attention_layernorm.weight: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default2]: size mismatch for post_attention_layernorm.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default2]: size mismatch for mlp.dense_h_to_4h.weight: copying a param with shape torch.Size([4096, 2048]) from checkpoint, the shape in current model is torch.Size([3072, 1536]). +[default2]: size mismatch for mlp.dense_h_to_4h.bias: copying a param with shape torch.Size([4096]) from checkpoint, the shape in current model is torch.Size([3072]). +[default2]: size mismatch for mlp.dense_4h_to_h.weight: copying a param with shape torch.Size([2048, 4096]) from checkpoint, the shape in current model is torch.Size([1536, 3072]). +[default2]: size mismatch for mlp.dense_4h_to_h.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default6]:[2022-10-06 13:47:52,901] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default4]:[2022-10-06 13:47:52,963] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_02_model_states.pt. +[default4]:[2022-10-06 13:47:52,963] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default2]:[2022-10-06 13:47:53,041] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default2]:[2022-10-06 13:47:53,041] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default2]:[2022-10-06 13:47:53,034] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default2]:[2022-10-06 13:47:53,035] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default0]:[2022-10-06 13:47:53,041] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default0]:[2022-10-06 13:47:53,042] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default0]:[2022-10-06 13:47:53,034] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default0]:[2022-10-06 13:47:53,035] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default6]:[2022-10-06 13:47:53,035] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default6]:[2022-10-06 13:47:53,036] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default0]:[2022-10-06 13:47:53,053] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default0]:[2022-10-06 13:47:53,054] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default3]:[2022-10-06 13:47:53,057] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default3]:[2022-10-06 13:47:53,059] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_01-model_states.pt... +[default4]:Traceback (most recent call last): +[default4]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default4]: main() +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default4]: return f(*args, **kwargs) +[default4]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default4]: pretrain( +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default4]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer +[default4]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default4]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default4]:[2022-10-06 13:47:52,988] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default4]:[2022-10-06 13:47:52,988] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default4]: load_path, client_states = self._load_checkpoint(load_dir, +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2640, in _load_checkpoint +[default4]: self.load_module_state_dict(state_dict=checkpoint['module'], +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1340, in load_module_state_dict +[default4]:[2022-10-06 13:47:53,013] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default4]: self.module.load_state_dir(load_dir=self._curr_ckpt_path, +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/module.py", line 604, in load_state_dir +[default4]: layer.load_state_dict(checkpoint) +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1604, in load_state_dict +[default4]: raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format( +[default4]:RuntimeError: Error(s) in loading state_dict for ParallelTransformerLayerPipe: +[default4]: size mismatch for input_layernorm.weight: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default4]: size mismatch for input_layernorm.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default4]: size mismatch for self_attention.query_key_value.weight: copying a param with shape torch.Size([3072, 2048]) from checkpoint, the shape in current model is torch.Size([2304, 1536]). +[default4]: size mismatch for self_attention.query_key_value.bias: copying a param with shape torch.Size([3072]) from checkpoint, the shape in current model is torch.Size([2304]). +[default4]: size mismatch for self_attention.dense.weight: copying a param with shape torch.Size([2048, 1024]) from checkpoint, the shape in current model is torch.Size([1536, 768]). +[default4]: size mismatch for self_attention.dense.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default4]: size mismatch for post_attention_layernorm.weight: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default4]: size mismatch for post_attention_layernorm.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default4]: size mismatch for mlp.dense_h_to_4h.weight: copying a param with shape torch.Size([4096, 2048]) from checkpoint, the shape in current model is torch.Size([3072, 1536]). +[default4]: size mismatch for mlp.dense_h_to_4h.bias: copying a param with shape torch.Size([4096]) from checkpoint, the shape in current model is torch.Size([3072]). +[default4]: size mismatch for mlp.dense_4h_to_h.weight: copying a param with shape torch.Size([2048, 4096]) from checkpoint, the shape in current model is torch.Size([1536, 3072]). +[default4]: size mismatch for mlp.dense_4h_to_h.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default1]:[2022-10-06 13:47:53,072] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_01_model_states.pt. +[default1]:[2022-10-06 13:47:53,073] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default3]:[2022-10-06 13:47:53,073] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_01_model_states.pt. +[default3]:[2022-10-06 13:47:53,073] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default1]:[2022-10-06 13:47:53,085] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_01-model_states.pt. +[default3]:[2022-10-06 13:47:53,085] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_01-model_states.pt. +[default1]:Traceback (most recent call last): +[default1]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default1]: main() +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default1]: return f(*args, **kwargs) +[default1]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default1]: pretrain( +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default1]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer +[default1]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default1]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default1]: load_path, client_states = self._load_checkpoint(load_dir, +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2640, in _load_checkpoint +[default1]: self.load_module_state_dict(state_dict=checkpoint['module'], +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1340, in load_module_state_dict +[default1]: self.module.load_state_dir(load_dir=self._curr_ckpt_path, +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/module.py", line 604, in load_state_dir +[default1]: layer.load_state_dict(checkpoint) +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/model/language_model.py", line 235, in load_state_dict +[default1]: self.word_embeddings.load_state_dict(state_dict_, strict=strict) +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1604, in load_state_dict +[default1]: raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format( +[default1]:RuntimeError: Error(s) in loading state_dict for VocabParallelEmbedding: +[default1]: size mismatch for weight: copying a param with shape torch.Size([125440, 2048]) from checkpoint, the shape in current model is torch.Size([125440, 1536]). +[default1]: size mismatch for norm.weight: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default1]: size mismatch for norm.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default3]:Traceback (most recent call last): +[default3]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default3]: main() +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default3]: return f(*args, **kwargs) +[default3]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default3]: pretrain( +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default3]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer +[default3]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default3]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default3]: load_path, client_states = self._load_checkpoint(load_dir, +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2640, in _load_checkpoint +[default3]: self.load_module_state_dict(state_dict=checkpoint['module'], +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1340, in load_module_state_dict +[default3]: self.module.load_state_dir(load_dir=self._curr_ckpt_path, +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/module.py", line 604, in load_state_dir +[default3]: layer.load_state_dict(checkpoint) +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/model/language_model.py", line 235, in load_state_dict +[default3]: self.word_embeddings.load_state_dict(state_dict_, strict=strict) +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1604, in load_state_dict +[default3]: raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format( +[default3]:RuntimeError: Error(s) in loading state_dict for VocabParallelEmbedding: +[default3]: size mismatch for weight: copying a param with shape torch.Size([125440, 2048]) from checkpoint, the shape in current model is torch.Size([125440, 1536]). +[default3]: size mismatch for norm.weight: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default3]: size mismatch for norm.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default2]:[2022-10-06 13:47:53,150] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default2]:[2022-10-06 13:47:53,151] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default0]:[2022-10-06 13:47:53,168] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default0]:Traceback (most recent call last): +[default0]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default0]: main() +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default0]: return f(*args, **kwargs) +[default0]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default0]: pretrain( +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default0]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer +[default0]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default0]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default0]: load_path, client_states = self._load_checkpoint(load_dir, +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2640, in _load_checkpoint +[default0]: self.load_module_state_dict(state_dict=checkpoint['module'], +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1340, in load_module_state_dict +[default0]: self.module.load_state_dir(load_dir=self._curr_ckpt_path, +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/module.py", line 604, in load_state_dir +[default0]: layer.load_state_dict(checkpoint) +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/model/language_model.py", line 235, in load_state_dict +[default0]: self.word_embeddings.load_state_dict(state_dict_, strict=strict) +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1604, in load_state_dict +[default1]:[2022-10-06 13:47:53,097] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default1]:[2022-10-06 13:47:53,099] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_01-model_states.pt... +[default3]:[2022-10-06 13:47:53,214] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_03_model_states.pt. +[default3]:[2022-10-06 13:47:53,215] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default5]:[2022-10-06 13:47:53,213] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_03_model_states.pt. +[default5]:[2022-10-06 13:47:53,214] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default1]:[2022-10-06 13:47:53,214] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_03_model_states.pt. +[default1]:[2022-10-06 13:47:53,215] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default2]:[2022-10-06 13:47:53,233] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default2]:[2022-10-06 13:47:53,234] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default1]:[2022-10-06 13:47:53,243] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default1]:[2022-10-06 13:47:53,245] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_01-model_states.pt... +[default4]:[2022-10-06 13:47:53,207] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default4]:[2022-10-06 13:47:53,207] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default2]:[2022-10-06 13:47:53,195] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default2]:[2022-10-06 13:47:53,197] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default3]:[2022-10-06 13:47:53,248] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default3]:[2022-10-06 13:47:53,249] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_01-model_states.pt... +[default0]:[2022-10-06 13:47:53,233] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default0]:[2022-10-06 13:47:53,234] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default0]:[2022-10-06 13:47:53,195] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default0]:[2022-10-06 13:47:53,197] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default6]:[2022-10-06 13:47:53,195] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default6]:[2022-10-06 13:47:53,197] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default3]:Traceback (most recent call last): +[default3]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default3]: main() +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default3]: return f(*args, **kwargs) +[default3]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default3]: pretrain( +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default3]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer +[default3]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default3]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default3]: load_path, client_states = self._load_checkpoint(load_dir, +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2640, in _load_checkpoint +[default3]: self.load_module_state_dict(state_dict=checkpoint['module'], +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1340, in load_module_state_dict +[default3]: self.module.load_state_dir(load_dir=self._curr_ckpt_path, +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/module.py", line 604, in load_state_dir +[default3]: layer.load_state_dict(checkpoint) +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/model/language_model.py", line 235, in load_state_dict +[default3]: self.word_embeddings.load_state_dict(state_dict_, strict=strict) +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1604, in load_state_dict +[default3]: raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format( +[default3]:RuntimeError: Error(s) in loading state_dict for VocabParallelEmbedding: +[default3]: size mismatch for weight: copying a param with shape torch.Size([125440, 2048]) from checkpoint, the shape in current model is torch.Size([125440, 1536]). +[default3]: size mismatch for norm.weight: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default3]: size mismatch for norm.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default0]: raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format( +[default0]:RuntimeError: Error(s) in loading state_dict for VocabParallelEmbedding: +[default0]: size mismatch for weight: copying a param with shape torch.Size([125440, 2048]) from checkpoint, the shape in current model is torch.Size([125440, 1536]). +[default0]: size mismatch for norm.weight: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default0]: size mismatch for norm.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default1]:Traceback (most recent call last): +[default1]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default1]: main() +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default1]: return f(*args, **kwargs) +[default1]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default1]: pretrain( +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default1]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer +[default1]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default1]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default1]: load_path, client_states = self._load_checkpoint(load_dir, +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2640, in _load_checkpoint +[default1]: self.load_module_state_dict(state_dict=checkpoint['module'], +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1340, in load_module_state_dict +[default1]: self.module.load_state_dir(load_dir=self._curr_ckpt_path, +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/module.py", line 604, in load_state_dir +[default1]: layer.load_state_dict(checkpoint) +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/model/language_model.py", line 235, in load_state_dict +[default1]: self.word_embeddings.load_state_dict(state_dict_, strict=strict) +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1604, in load_state_dict +[default1]: raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format( +[default1]:RuntimeError: Error(s) in loading state_dict for VocabParallelEmbedding: +[default1]: size mismatch for weight: copying a param with shape torch.Size([125440, 2048]) from checkpoint, the shape in current model is torch.Size([125440, 1536]). +[default1]: size mismatch for norm.weight: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default1]: size mismatch for norm.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default3]:[2022-10-06 13:47:53,229] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_01-model_states.pt. +[default1]:[2022-10-06 13:47:53,247] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_01-model_states.pt. +[default5]:[2022-10-06 13:47:53,333] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_03_model_states.pt. +[default5]:[2022-10-06 13:47:53,334] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default1]:Traceback (most recent call last): +[default1]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default1]: main() +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default1]: return f(*args, **kwargs) +[default1]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default1]: pretrain( +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default1]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer +[default1]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default1]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default1]: load_path, client_states = self._load_checkpoint(load_dir, +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2640, in _load_checkpoint +[default1]: self.load_module_state_dict(state_dict=checkpoint['module'], +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1340, in load_module_state_dict +[default1]: self.module.load_state_dir(load_dir=self._curr_ckpt_path, +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/module.py", line 604, in load_state_dir +[default1]: layer.load_state_dict(checkpoint) +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1604, in load_state_dict +[default1]: raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format( +[default1]:RuntimeError: Error(s) in loading state_dict for ParallelTransformerLayerPipe: +[default1]: size mismatch for input_layernorm.weight: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default1]: size mismatch for input_layernorm.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default1]: size mismatch for self_attention.query_key_value.weight: copying a param with shape torch.Size([3072, 2048]) from checkpoint, the shape in current model is torch.Size([2304, 1536]). +[default1]: size mismatch for self_attention.query_key_value.bias: copying a param with shape torch.Size([3072]) from checkpoint, the shape in current model is torch.Size([2304]). +[default1]: size mismatch for self_attention.dense.weight: copying a param with shape torch.Size([2048, 1024]) from checkpoint, the shape in current model is torch.Size([1536, 768]). +[default1]: size mismatch for self_attention.dense.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default1]: size mismatch for post_attention_layernorm.weight: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default1]: size mismatch for post_attention_layernorm.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default1]: size mismatch for mlp.dense_h_to_4h.weight: copying a param with shape torch.Size([4096, 2048]) from checkpoint, the shape in current model is torch.Size([3072, 1536]). +[default1]: size mismatch for mlp.dense_h_to_4h.bias: copying a param with shape torch.Size([4096]) from checkpoint, the shape in current model is torch.Size([3072]). +[default1]: size mismatch for mlp.dense_4h_to_h.weight: copying a param with shape torch.Size([2048, 4096]) from checkpoint, the shape in current model is torch.Size([1536, 3072]). +[default1]: size mismatch for mlp.dense_4h_to_h.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default3]:Traceback (most recent call last): +[default3]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default3]:[2022-10-06 13:47:53,264] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default3]:[2022-10-06 13:47:53,264] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_01-model_states.pt... +[default3]:[2022-10-06 13:47:53,296] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_01-model_states.pt. +[default3]: main() +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default3]: return f(*args, **kwargs) +[default3]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default3]: pretrain( +[default5]:[2022-10-06 13:47:53,262] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default5]:[2022-10-06 13:47:53,263] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_01-model_states.pt... +[default5]:[2022-10-06 13:47:53,293] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_01-model_states.pt. +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default3]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer +[default3]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default3]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default3]: load_path, client_states = self._load_checkpoint(load_dir, +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2640, in _load_checkpoint +[default3]: self.load_module_state_dict(state_dict=checkpoint['module'], +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1340, in load_module_state_dict +[default3]: self.module.load_state_dir(load_dir=self._curr_ckpt_path, +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/module.py", line 604, in load_state_dir +[default3]: layer.load_state_dict(checkpoint) +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1604, in load_state_dict +[default3]: raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format( +[default1]:[2022-10-06 13:47:53,267] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default1]:[2022-10-06 13:47:53,267] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_01-model_states.pt... +[default1]:[2022-10-06 13:47:53,298] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_01-model_states.pt. +[default3]:RuntimeError: Error(s) in loading state_dict for ParallelTransformerLayerPipe: +[default3]: size mismatch for input_layernorm.weight: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default3]: size mismatch for input_layernorm.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default3]: size mismatch for self_attention.query_key_value.weight: copying a param with shape torch.Size([3072, 2048]) from checkpoint, the shape in current model is torch.Size([2304, 1536]). +[default3]: size mismatch for self_attention.query_key_value.bias: copying a param with shape torch.Size([3072]) from checkpoint, the shape in current model is torch.Size([2304]). +[default3]: size mismatch for self_attention.dense.weight: copying a param with shape torch.Size([2048, 1024]) from checkpoint, the shape in current model is torch.Size([1536, 768]). +[default3]: size mismatch for self_attention.dense.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default3]: size mismatch for post_attention_layernorm.weight: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default3]: size mismatch for post_attention_layernorm.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default3]: size mismatch for mlp.dense_h_to_4h.weight: copying a param with shape torch.Size([4096, 2048]) from checkpoint, the shape in current model is torch.Size([3072, 1536]). +[default3]: size mismatch for mlp.dense_h_to_4h.bias: copying a param with shape torch.Size([4096]) from checkpoint, the shape in current model is torch.Size([3072]). +[default3]: size mismatch for mlp.dense_4h_to_h.weight: copying a param with shape torch.Size([2048, 4096]) from checkpoint, the shape in current model is torch.Size([1536, 3072]). +[default3]: size mismatch for mlp.dense_4h_to_h.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default7]:[2022-10-06 13:47:53,332] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_03_model_states.pt. +[default7]:[2022-10-06 13:47:53,333] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default4]:[2022-10-06 13:47:53,304] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_02_model_states.pt. +[default4]:[2022-10-06 13:47:53,305] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default6]:[2022-10-06 13:47:53,303] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_02_model_states.pt. +[default6]:[2022-10-06 13:47:53,304] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default6]:[2022-10-06 13:47:53,353] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default6]:[2022-10-06 13:47:53,353] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default2]:[2022-10-06 13:47:53,334] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default2]:Traceback (most recent call last): +[default2]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default2]: main() +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default2]: return f(*args, **kwargs) +[default2]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default2]: pretrain( +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default2]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer +[default2]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default2]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default2]: load_path, client_states = self._load_checkpoint(load_dir, +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2640, in _load_checkpoint +[default2]: self.load_module_state_dict(state_dict=checkpoint['module'], +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1340, in load_module_state_dict +[default2]: self.module.load_state_dir(load_dir=self._curr_ckpt_path, +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/module.py", line 604, in load_state_dir +[default2]: layer.load_state_dict(checkpoint) +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/model/language_model.py", line 235, in load_state_dict +[default2]: self.word_embeddings.load_state_dict(state_dict_, strict=strict) +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1604, in load_state_dict +[default2]: raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format( +[default2]:RuntimeError: Error(s) in loading state_dict for VocabParallelEmbedding: +[default2]: size mismatch for weight: copying a param with shape torch.Size([125440, 2048]) from checkpoint, the shape in current model is torch.Size([125440, 1536]). +[default2]: size mismatch for norm.weight: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default2]: size mismatch for norm.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default4]:[2022-10-06 13:47:53,351] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default4]:[2022-10-06 13:47:53,352] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default2]:[2022-10-06 13:47:53,322] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default0]:[2022-10-06 13:47:53,332] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default6]:[2022-10-06 13:47:53,276] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default6]:[2022-10-06 13:47:53,277] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default0]:[2022-10-06 13:47:53,317] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default0]:Traceback (most recent call last): +[default0]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default0]: main() +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default0]: return f(*args, **kwargs) +[default0]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default0]: pretrain( +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default0]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer +[default0]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default0]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default0]: load_path, client_states = self._load_checkpoint(load_dir, +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2640, in _load_checkpoint +[default0]: self.load_module_state_dict(state_dict=checkpoint['module'], +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1340, in load_module_state_dict +[default0]: self.module.load_state_dir(load_dir=self._curr_ckpt_path, +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/module.py", line 604, in load_state_dir +[default0]: layer.load_state_dict(checkpoint) +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/model/language_model.py", line 235, in load_state_dict +[default0]: self.word_embeddings.load_state_dict(state_dict_, strict=strict) +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1604, in load_state_dict +[default0]: raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format( +[default0]:RuntimeError: Error(s) in loading state_dict for VocabParallelEmbedding: +[default0]: size mismatch for weight: copying a param with shape torch.Size([125440, 2048]) from checkpoint, the shape in current model is torch.Size([125440, 1536]). +[default0]: size mismatch for norm.weight: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default0]: size mismatch for norm.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default5]:Traceback (most recent call last): +[default5]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default5]: main() +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default5]: return f(*args, **kwargs) +[default5]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default5]: pretrain( +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default5]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer +[default5]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default5]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default5]: load_path, client_states = self._load_checkpoint(load_dir, +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2640, in _load_checkpoint +[default5]: self.load_module_state_dict(state_dict=checkpoint['module'], +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1340, in load_module_state_dict +[default5]: self.module.load_state_dir(load_dir=self._curr_ckpt_path, +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/module.py", line 604, in load_state_dir +[default5]: layer.load_state_dict(checkpoint) +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1604, in load_state_dict +[default5]: raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format( +[default5]:RuntimeError: Error(s) in loading state_dict for ParallelTransformerLayerPipe: +[default5]: size mismatch for input_layernorm.weight: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default5]: size mismatch for input_layernorm.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default5]: size mismatch for self_attention.query_key_value.weight: copying a param with shape torch.Size([3072, 2048]) from checkpoint, the shape in current model is torch.Size([2304, 1536]). +[default5]: size mismatch for self_attention.query_key_value.bias: copying a param with shape torch.Size([3072]) from checkpoint, the shape in current model is torch.Size([2304]). +[default5]: size mismatch for self_attention.dense.weight: copying a param with shape torch.Size([2048, 1024]) from checkpoint, the shape in current model is torch.Size([1536, 768]). +[default5]: size mismatch for self_attention.dense.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default5]: size mismatch for post_attention_layernorm.weight: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default5]: size mismatch for post_attention_layernorm.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default5]: size mismatch for mlp.dense_h_to_4h.weight: copying a param with shape torch.Size([4096, 2048]) from checkpoint, the shape in current model is torch.Size([3072, 1536]). +[default5]: size mismatch for mlp.dense_h_to_4h.bias: copying a param with shape torch.Size([4096]) from checkpoint, the shape in current model is torch.Size([3072]). +[default5]: size mismatch for mlp.dense_4h_to_h.weight: copying a param with shape torch.Size([2048, 4096]) from checkpoint, the shape in current model is torch.Size([1536, 3072]). +[default5]: size mismatch for mlp.dense_4h_to_h.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default2]:Traceback (most recent call last): +[default2]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default2]: main() +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default2]: return f(*args, **kwargs) +[default2]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default2]: pretrain( +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default2]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer +[default2]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default2]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default2]: load_path, client_states = self._load_checkpoint(load_dir, +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2640, in _load_checkpoint +[default2]: self.load_module_state_dict(state_dict=checkpoint['module'], +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1340, in load_module_state_dict +[default2]: self.module.load_state_dir(load_dir=self._curr_ckpt_path, +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/module.py", line 604, in load_state_dir +[default2]: layer.load_state_dict(checkpoint) +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/model/language_model.py", line 235, in load_state_dict +[default2]: self.word_embeddings.load_state_dict(state_dict_, strict=strict) +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1604, in load_state_dict +[default2]: raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format( +[default2]:RuntimeError: Error(s) in loading state_dict for VocabParallelEmbedding: +[default2]: size mismatch for weight: copying a param with shape torch.Size([125440, 2048]) from checkpoint, the shape in current model is torch.Size([125440, 1536]). +[default2]: size mismatch for norm.weight: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default2]: size mismatch for norm.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default0]:Traceback (most recent call last): +[default0]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default0]: main() +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default0]: return f(*args, **kwargs) +[default6]:[2022-10-06 13:47:53,322] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default0]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default0]: pretrain( +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default0]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer +[default0]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default0]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default0]: load_path, client_states = self._load_checkpoint(load_dir, +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2640, in _load_checkpoint +[default0]: self.load_module_state_dict(state_dict=checkpoint['module'], +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1340, in load_module_state_dict +[default0]: self.module.load_state_dir(load_dir=self._curr_ckpt_path, +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/module.py", line 604, in load_state_dir +[default0]: layer.load_state_dict(checkpoint) +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/model/language_model.py", line 235, in load_state_dict +[default0]: self.word_embeddings.load_state_dict(state_dict_, strict=strict) +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1604, in load_state_dict +[default0]: raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format( +[default0]:RuntimeError: Error(s) in loading state_dict for VocabParallelEmbedding: +[default0]: size mismatch for weight: copying a param with shape torch.Size([125440, 2048]) from checkpoint, the shape in current model is torch.Size([125440, 1536]). +[default0]: size mismatch for norm.weight: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default0]: size mismatch for norm.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default2]:Traceback (most recent call last): +[default2]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default2]: main() +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default2]: return f(*args, **kwargs) +[default2]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default2]: pretrain( +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default2]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer +[default2]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default2]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default2]: load_path, client_states = self._load_checkpoint(load_dir, +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2640, in _load_checkpoint +[default2]: self.load_module_state_dict(state_dict=checkpoint['module'], +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1340, in load_module_state_dict +[default2]: self.module.load_state_dir(load_dir=self._curr_ckpt_path, +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/module.py", line 604, in load_state_dir +[default2]: layer.load_state_dict(checkpoint) +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/model/language_model.py", line 235, in load_state_dict +[default2]: self.word_embeddings.load_state_dict(state_dict_, strict=strict) +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1604, in load_state_dict +[default2]: raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format( +[default2]:RuntimeError: Error(s) in loading state_dict for VocabParallelEmbedding: +[default2]: size mismatch for weight: copying a param with shape torch.Size([125440, 2048]) from checkpoint, the shape in current model is torch.Size([125440, 1536]). +[default2]: size mismatch for norm.weight: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default2]:[2022-10-06 13:47:53,301] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default2]: size mismatch for norm.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default6]:Traceback (most recent call last): +[default6]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default6]: main() +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default6]: return f(*args, **kwargs) +[default6]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default6]: pretrain( +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default6]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer +[default6]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default6]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default6]: load_path, client_states = self._load_checkpoint(load_dir, +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2640, in _load_checkpoint +[default6]: self.load_module_state_dict(state_dict=checkpoint['module'], +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1340, in load_module_state_dict +[default6]: self.module.load_state_dir(load_dir=self._curr_ckpt_path, +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/module.py", line 604, in load_state_dir +[default6]: layer.load_state_dict(checkpoint) +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/model/language_model.py", line 235, in load_state_dict +[default6]: self.word_embeddings.load_state_dict(state_dict_, strict=strict) +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1604, in load_state_dict +[default6]: raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format( +[default6]:RuntimeError: Error(s) in loading state_dict for VocabParallelEmbedding: +[default6]: size mismatch for weight: copying a param with shape torch.Size([125440, 2048]) from checkpoint, the shape in current model is torch.Size([125440, 1536]). +[default6]: size mismatch for norm.weight: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default6]: size mismatch for norm.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default5]:[2022-10-06 13:47:53,366] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default5]:[2022-10-06 13:47:53,366] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_01-model_states.pt... +[default5]:[2022-10-06 13:47:53,448] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_01-model_states.pt. +[default7]:[2022-10-06 13:47:53,354] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default7]:[2022-10-06 13:47:53,355] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_01-model_states.pt... +[default7]:[2022-10-06 13:47:53,448] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_01-model_states.pt. +[default4]:[2022-10-06 13:47:53,354] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default4]:[2022-10-06 13:47:53,355] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default4]:[2022-10-06 13:47:53,381] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default6]:[2022-10-06 13:47:53,376] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default1]:[2022-10-06 13:47:53,439] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_01-model_states.pt. +[default4]:Traceback (most recent call last): +[default4]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default4]: main() +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default4]: return f(*args, **kwargs) +[default4]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default4]: pretrain( +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default4]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer +[default4]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default4]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default4]: load_path, client_states = self._load_checkpoint(load_dir, +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2640, in _load_checkpoint +[default4]: self.load_module_state_dict(state_dict=checkpoint['module'], +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1340, in load_module_state_dict +[default4]: self.module.load_state_dir(load_dir=self._curr_ckpt_path, +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/module.py", line 604, in load_state_dir +[default4]: layer.load_state_dict(checkpoint) +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1604, in load_state_dict +[default4]: raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format( +[default4]:RuntimeError: Error(s) in loading state_dict for ParallelTransformerLayerPipe: +[default4]: size mismatch for input_layernorm.weight: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default4]: size mismatch for input_layernorm.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default4]: size mismatch for self_attention.query_key_value.weight: copying a param with shape torch.Size([3072, 2048]) from checkpoint, the shape in current model is torch.Size([2304, 1536]). +[default4]: size mismatch for self_attention.query_key_value.bias: copying a param with shape torch.Size([3072]) from checkpoint, the shape in current model is torch.Size([2304]). +[default4]: size mismatch for self_attention.dense.weight: copying a param with shape torch.Size([2048, 1024]) from checkpoint, the shape in current model is torch.Size([1536, 768]). +[default4]: size mismatch for self_attention.dense.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default4]: size mismatch for post_attention_layernorm.weight: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default4]: size mismatch for post_attention_layernorm.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default4]: size mismatch for mlp.dense_h_to_4h.weight: copying a param with shape torch.Size([4096, 2048]) from checkpoint, the shape in current model is torch.Size([3072, 1536]). +[default4]: size mismatch for mlp.dense_h_to_4h.bias: copying a param with shape torch.Size([4096]) from checkpoint, the shape in current model is torch.Size([3072]). +[default4]: size mismatch for mlp.dense_4h_to_h.weight: copying a param with shape torch.Size([2048, 4096]) from checkpoint, the shape in current model is torch.Size([1536, 3072]). +[default4]: size mismatch for mlp.dense_4h_to_h.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default7]:Traceback (most recent call last): +[default7]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default7]: main() +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default7]: return f(*args, **kwargs) +[default7]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default7]: pretrain( +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default7]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer +[default7]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default7]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default7]: load_path, client_states = self._load_checkpoint(load_dir, +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2640, in _load_checkpoint +[default7]: self.load_module_state_dict(state_dict=checkpoint['module'], +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1340, in load_module_state_dict +[default7]: self.module.load_state_dir(load_dir=self._curr_ckpt_path, +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/module.py", line 604, in load_state_dir +[default7]: layer.load_state_dict(checkpoint) +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1604, in load_state_dict +[default7]: raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format( +[default7]:RuntimeError: Error(s) in loading state_dict for ParallelTransformerLayerPipe: +[default7]: size mismatch for input_layernorm.weight: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default7]: size mismatch for input_layernorm.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default7]: size mismatch for self_attention.query_key_value.weight: copying a param with shape torch.Size([3072, 2048]) from checkpoint, the shape in current model is torch.Size([2304, 1536]). +[default7]: size mismatch for self_attention.query_key_value.bias: copying a param with shape torch.Size([3072]) from checkpoint, the shape in current model is torch.Size([2304]). +[default7]: size mismatch for self_attention.dense.weight: copying a param with shape torch.Size([2048, 1024]) from checkpoint, the shape in current model is torch.Size([1536, 768]). +[default7]: size mismatch for self_attention.dense.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default7]: size mismatch for post_attention_layernorm.weight: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default7]: size mismatch for post_attention_layernorm.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default7]: size mismatch for mlp.dense_h_to_4h.weight: copying a param with shape torch.Size([4096, 2048]) from checkpoint, the shape in current model is torch.Size([3072, 1536]). +[default7]: size mismatch for mlp.dense_h_to_4h.bias: copying a param with shape torch.Size([4096]) from checkpoint, the shape in current model is torch.Size([3072]). +[default7]: size mismatch for mlp.dense_4h_to_h.weight: copying a param with shape torch.Size([2048, 4096]) from checkpoint, the shape in current model is torch.Size([1536, 3072]). +[default7]: size mismatch for mlp.dense_4h_to_h.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default5]:Traceback (most recent call last): +[default5]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default5]: main() +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default5]: return f(*args, **kwargs) +[default5]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default5]: pretrain( +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default5]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer +[default5]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default5]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default5]: load_path, client_states = self._load_checkpoint(load_dir, +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2640, in _load_checkpoint +[default5]: self.load_module_state_dict(state_dict=checkpoint['module'], +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1340, in load_module_state_dict +[default5]: self.module.load_state_dir(load_dir=self._curr_ckpt_path, +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/module.py", line 604, in load_state_dir +[default5]: layer.load_state_dict(checkpoint) +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1604, in load_state_dict +[default5]: raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format( +[default5]:RuntimeError: Error(s) in loading state_dict for ParallelTransformerLayerPipe: +[default5]: size mismatch for input_layernorm.weight: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default5]: size mismatch for input_layernorm.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default5]: size mismatch for self_attention.query_key_value.weight: copying a param with shape torch.Size([3072, 2048]) from checkpoint, the shape in current model is torch.Size([2304, 1536]). +[default5]: size mismatch for self_attention.query_key_value.bias: copying a param with shape torch.Size([3072]) from checkpoint, the shape in current model is torch.Size([2304]). +[default5]: size mismatch for self_attention.dense.weight: copying a param with shape torch.Size([2048, 1024]) from checkpoint, the shape in current model is torch.Size([1536, 768]). +[default5]: size mismatch for self_attention.dense.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default5]: size mismatch for post_attention_layernorm.weight: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default5]: size mismatch for post_attention_layernorm.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default5]: size mismatch for mlp.dense_h_to_4h.weight: copying a param with shape torch.Size([4096, 2048]) from checkpoint, the shape in current model is torch.Size([3072, 1536]). +[default5]: size mismatch for mlp.dense_h_to_4h.bias: copying a param with shape torch.Size([4096]) from checkpoint, the shape in current model is torch.Size([3072]). +[default5]: size mismatch for mlp.dense_4h_to_h.weight: copying a param with shape torch.Size([2048, 4096]) from checkpoint, the shape in current model is torch.Size([1536, 3072]). +[default5]: size mismatch for mlp.dense_4h_to_h.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default6]:Traceback (most recent call last): +[default6]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default6]: main() +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default6]: return f(*args, **kwargs) +[default6]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default6]: pretrain( +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default6]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer +[default6]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default6]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default6]: load_path, client_states = self._load_checkpoint(load_dir, +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2640, in _load_checkpoint +[default6]: self.load_module_state_dict(state_dict=checkpoint['module'], +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1340, in load_module_state_dict +[default6]: self.module.load_state_dir(load_dir=self._curr_ckpt_path, +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/module.py", line 604, in load_state_dir +[default6]: layer.load_state_dict(checkpoint) +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1604, in load_state_dict +[default6]: raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format( +[default6]:RuntimeError: Error(s) in loading state_dict for ParallelTransformerLayerPipe: +[default6]: size mismatch for input_layernorm.weight: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default6]: size mismatch for input_layernorm.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default6]: size mismatch for self_attention.query_key_value.weight: copying a param with shape torch.Size([3072, 2048]) from checkpoint, the shape in current model is torch.Size([2304, 1536]). +[default6]: size mismatch for self_attention.query_key_value.bias: copying a param with shape torch.Size([3072]) from checkpoint, the shape in current model is torch.Size([2304]). +[default6]: size mismatch for self_attention.dense.weight: copying a param with shape torch.Size([2048, 1024]) from checkpoint, the shape in current model is torch.Size([1536, 768]). +[default6]: size mismatch for self_attention.dense.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default6]: size mismatch for post_attention_layernorm.weight: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default6]: size mismatch for post_attention_layernorm.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default6]: size mismatch for mlp.dense_h_to_4h.weight: copying a param with shape torch.Size([4096, 2048]) from checkpoint, the shape in current model is torch.Size([3072, 1536]). +[default6]: size mismatch for mlp.dense_h_to_4h.bias: copying a param with shape torch.Size([4096]) from checkpoint, the shape in current model is torch.Size([3072]). +[default6]: size mismatch for mlp.dense_4h_to_h.weight: copying a param with shape torch.Size([2048, 4096]) from checkpoint, the shape in current model is torch.Size([1536, 3072]). +[default6]: size mismatch for mlp.dense_4h_to_h.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default3]:Traceback (most recent call last): +[default3]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default3]: main() +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default3]: return f(*args, **kwargs) +[default3]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default3]: pretrain( +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default3]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer +[default3]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default3]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default3]: load_path, client_states = self._load_checkpoint(load_dir, +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2640, in _load_checkpoint +[default3]: self.load_module_state_dict(state_dict=checkpoint['module'], +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1340, in load_module_state_dict +[default3]: self.module.load_state_dir(load_dir=self._curr_ckpt_path, +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/module.py", line 604, in load_state_dir +[default3]: layer.load_state_dict(checkpoint) +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/model/language_model.py", line 235, in load_state_dict +[default3]: self.word_embeddings.load_state_dict(state_dict_, strict=strict) +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1604, in load_state_dict +[default3]: raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format( +[default3]:RuntimeError: Error(s) in loading state_dict for VocabParallelEmbedding: +[default3]: size mismatch for weight: copying a param with shape torch.Size([125440, 2048]) from checkpoint, the shape in current model is torch.Size([125440, 1536]). +[default3]: size mismatch for norm.weight: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default3]: size mismatch for norm.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default3]:[2022-10-06 13:47:53,451] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_01-model_states.pt. +[default6]:[2022-10-06 13:47:53,428] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default1]:Traceback (most recent call last): +[default1]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default1]: main() +[default6]:[2022-10-06 13:47:53,429] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default1]: return f(*args, **kwargs) +[default1]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default1]: pretrain( +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default1]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer +[default1]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default1]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default1]: load_path, client_states = self._load_checkpoint(load_dir, +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2640, in _load_checkpoint +[default1]: self.load_module_state_dict(state_dict=checkpoint['module'], +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1340, in load_module_state_dict +[default1]: self.module.load_state_dir(load_dir=self._curr_ckpt_path, +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/module.py", line 604, in load_state_dir +[default1]: layer.load_state_dict(checkpoint) +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/model/language_model.py", line 235, in load_state_dict +[default1]: self.word_embeddings.load_state_dict(state_dict_, strict=strict) +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1604, in load_state_dict +[default1]: raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format( +[default1]:RuntimeError: Error(s) in loading state_dict for VocabParallelEmbedding: +[default1]: size mismatch for weight: copying a param with shape torch.Size([125440, 2048]) from checkpoint, the shape in current model is torch.Size([125440, 1536]). +[default1]: size mismatch for norm.weight: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default1]: size mismatch for norm.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default4]:[2022-10-06 13:47:53,512] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default4]:Traceback (most recent call last): +[default4]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default4]: main() +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default4]: return f(*args, **kwargs) +[default4]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default4]: pretrain( +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default4]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer +[default4]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default4]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default4]: load_path, client_states = self._load_checkpoint(load_dir, +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2640, in _load_checkpoint +[default4]: self.load_module_state_dict(state_dict=checkpoint['module'], +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1340, in load_module_state_dict +[default4]: self.module.load_state_dir(load_dir=self._curr_ckpt_path, +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/module.py", line 604, in load_state_dir +[default4]: layer.load_state_dict(checkpoint) +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/model/language_model.py", line 235, in load_state_dict +[default4]: self.word_embeddings.load_state_dict(state_dict_, strict=strict) +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1604, in load_state_dict +[default4]: raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format( +[default4]:RuntimeError: Error(s) in loading state_dict for VocabParallelEmbedding: +[default4]: size mismatch for weight: copying a param with shape torch.Size([125440, 2048]) from checkpoint, the shape in current model is torch.Size([125440, 1536]). +[default4]: size mismatch for norm.weight: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default4]: size mismatch for norm.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default6]:[2022-10-06 13:47:53,550] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default6]:Traceback (most recent call last): +[default6]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default6]: main() +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default6]: return f(*args, **kwargs) +[default6]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default6]: pretrain( +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default6]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer +[default6]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default6]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default6]: load_path, client_states = self._load_checkpoint(load_dir, +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2640, in _load_checkpoint +[default6]: self.load_module_state_dict(state_dict=checkpoint['module'], +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1340, in load_module_state_dict +[default6]: self.module.load_state_dir(load_dir=self._curr_ckpt_path, +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/module.py", line 604, in load_state_dir +[default6]: layer.load_state_dict(checkpoint) +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/model/language_model.py", line 235, in load_state_dict +[default6]: self.word_embeddings.load_state_dict(state_dict_, strict=strict) +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1604, in load_state_dict +[default6]: raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format( +[default6]:RuntimeError: Error(s) in loading state_dict for VocabParallelEmbedding: +[default6]: size mismatch for weight: copying a param with shape torch.Size([125440, 2048]) from checkpoint, the shape in current model is torch.Size([125440, 1536]). +[default6]: size mismatch for norm.weight: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default6]: size mismatch for norm.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default0]:[2022-10-06 13:47:53,770] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_02_model_states.pt. +[default0]:[2022-10-06 13:47:53,771] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default0]:[2022-10-06 13:47:53,789] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default0]:[2022-10-06 13:47:53,789] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default0]:[2022-10-06 13:47:53,807] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default0]:Traceback (most recent call last): +[default0]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default0]: main() +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default0]: return f(*args, **kwargs) +[default0]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default0]: pretrain( +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default0]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer +[default0]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default0]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default0]: load_path, client_states = self._load_checkpoint(load_dir, +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2640, in _load_checkpoint +[default0]: self.load_module_state_dict(state_dict=checkpoint['module'], +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1340, in load_module_state_dict +[default0]: self.module.load_state_dir(load_dir=self._curr_ckpt_path, +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/module.py", line 604, in load_state_dir +[default0]: layer.load_state_dict(checkpoint) +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1604, in load_state_dict +[default0]: raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format( +[default0]:RuntimeError: Error(s) in loading state_dict for ParallelTransformerLayerPipe: +[default0]: size mismatch for input_layernorm.weight: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default0]: size mismatch for input_layernorm.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default0]: size mismatch for self_attention.query_key_value.weight: copying a param with shape torch.Size([3072, 2048]) from checkpoint, the shape in current model is torch.Size([2304, 1536]). +[default0]: size mismatch for self_attention.query_key_value.bias: copying a param with shape torch.Size([3072]) from checkpoint, the shape in current model is torch.Size([2304]). +[default0]: size mismatch for self_attention.dense.weight: copying a param with shape torch.Size([2048, 1024]) from checkpoint, the shape in current model is torch.Size([1536, 768]). +[default0]: size mismatch for self_attention.dense.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default0]: size mismatch for post_attention_layernorm.weight: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default0]: size mismatch for post_attention_layernorm.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default0]: size mismatch for mlp.dense_h_to_4h.weight: copying a param with shape torch.Size([4096, 2048]) from checkpoint, the shape in current model is torch.Size([3072, 1536]). +[default0]: size mismatch for mlp.dense_h_to_4h.bias: copying a param with shape torch.Size([4096]) from checkpoint, the shape in current model is torch.Size([3072]). +[default0]: size mismatch for mlp.dense_4h_to_h.weight: copying a param with shape torch.Size([2048, 4096]) from checkpoint, the shape in current model is torch.Size([1536, 3072]). +[default0]: size mismatch for mlp.dense_4h_to_h.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default2]:[2022-10-06 13:47:53,795] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_02_model_states.pt. +[default2]:[2022-10-06 13:47:53,796] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default2]:[2022-10-06 13:47:53,823] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default2]:[2022-10-06 13:47:53,823] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default2]:[2022-10-06 13:47:53,849] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default5]:[2022-10-06 13:47:53,822] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_01_model_states.pt. +[default7]:[2022-10-06 13:47:53,821] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_01_model_states.pt. +[default7]:[2022-10-06 13:47:53,822] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default5]:[2022-10-06 13:47:53,822] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default7]:[2022-10-06 13:47:53,802] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_01_model_states.pt. +[default7]:[2022-10-06 13:47:53,803] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default5]:[2022-10-06 13:47:53,802] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_01_model_states.pt. +[default5]:[2022-10-06 13:47:53,802] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default0]:[2022-10-06 13:47:53,848] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_02_model_states.pt. +[default0]:[2022-10-06 13:47:53,849] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default0]:[2022-10-06 13:47:53,877] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default0]:[2022-10-06 13:47:53,878] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default1]:[2022-10-06 13:47:53,869] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_03_model_states.pt. +[default1]:[2022-10-06 13:47:53,869] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default2]:Traceback (most recent call last): +[default2]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default2]: main() +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default2]: return f(*args, **kwargs) +[default2]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default2]: pretrain( +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default2]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer +[default2]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default2]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default2]: load_path, client_states = self._load_checkpoint(load_dir, +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2640, in _load_checkpoint +[default2]: self.load_module_state_dict(state_dict=checkpoint['module'], +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1340, in load_module_state_dict +[default2]: self.module.load_state_dir(load_dir=self._curr_ckpt_path, +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/module.py", line 604, in load_state_dir +[default2]: layer.load_state_dict(checkpoint) +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1604, in load_state_dict +[default2]: raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format( +[default2]:RuntimeError: Error(s) in loading state_dict for ParallelTransformerLayerPipe: +[default2]: size mismatch for input_layernorm.weight: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default2]: size mismatch for input_layernorm.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default2]: size mismatch for self_attention.query_key_value.weight: copying a param with shape torch.Size([3072, 2048]) from checkpoint, the shape in current model is torch.Size([2304, 1536]). +[default2]: size mismatch for self_attention.query_key_value.bias: copying a param with shape torch.Size([3072]) from checkpoint, the shape in current model is torch.Size([2304]). +[default2]: size mismatch for self_attention.dense.weight: copying a param with shape torch.Size([2048, 1024]) from checkpoint, the shape in current model is torch.Size([1536, 768]). +[default2]: size mismatch for self_attention.dense.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default2]: size mismatch for post_attention_layernorm.weight: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default2]: size mismatch for post_attention_layernorm.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default2]: size mismatch for mlp.dense_h_to_4h.weight: copying a param with shape torch.Size([4096, 2048]) from checkpoint, the shape in current model is torch.Size([3072, 1536]). +[default2]: size mismatch for mlp.dense_h_to_4h.bias: copying a param with shape torch.Size([4096]) from checkpoint, the shape in current model is torch.Size([3072]). +[default2]: size mismatch for mlp.dense_4h_to_h.weight: copying a param with shape torch.Size([2048, 4096]) from checkpoint, the shape in current model is torch.Size([1536, 3072]). +[default2]: size mismatch for mlp.dense_4h_to_h.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default5]:[2022-10-06 13:47:53,919] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default7]:[2022-10-06 13:47:53,919] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default7]:[2022-10-06 13:47:53,920] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_01-model_states.pt... +[default5]:[2022-10-06 13:47:53,920] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_01-model_states.pt... +[default6]:[2022-10-06 13:47:53,888] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default6]:[2022-10-06 13:47:53,888] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default5]:[2022-10-06 13:47:53,937] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_01_model_states.pt. +[default5]:[2022-10-06 13:47:53,938] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default7]:[2022-10-06 13:47:53,937] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_01_model_states.pt. +[default7]:[2022-10-06 13:47:53,938] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default7]:[2022-10-06 13:47:53,917] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default7]:[2022-10-06 13:47:53,918] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_01-model_states.pt... +[default4]:[2022-10-06 13:47:53,888] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default4]:[2022-10-06 13:47:53,889] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default4]:[2022-10-06 13:47:53,971] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default4]:[2022-10-06 13:47:53,972] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default5]:[2022-10-06 13:47:53,917] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default5]:[2022-10-06 13:47:53,918] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_01-model_states.pt... +[default0]:[2022-10-06 13:47:53,900] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default3]:[2022-10-06 13:47:53,890] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_03_model_states.pt. +[default3]:[2022-10-06 13:47:53,890] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default3]:[2022-10-06 13:47:53,916] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default3]:[2022-10-06 13:47:53,917] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_01-model_states.pt... +[default3]:[2022-10-06 13:47:53,936] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_01-model_states.pt. +[default1]:[2022-10-06 13:47:53,893] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default1]:[2022-10-06 13:47:53,893] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_01-model_states.pt... +[default1]:[2022-10-06 13:47:53,918] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_01-model_states.pt. +[default0]:Traceback (most recent call last): +[default0]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default0]: main() +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default0]: return f(*args, **kwargs) +[default0]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default0]: pretrain( +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default0]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer +[default0]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default0]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default0]: load_path, client_states = self._load_checkpoint(load_dir, +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2640, in _load_checkpoint +[default0]: self.load_module_state_dict(state_dict=checkpoint['module'], +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1340, in load_module_state_dict +[default0]: self.module.load_state_dir(load_dir=self._curr_ckpt_path, +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/module.py", line 604, in load_state_dir +[default0]: layer.load_state_dict(checkpoint) +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1604, in load_state_dict +[default0]: raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format( +[default0]:RuntimeError: Error(s) in loading state_dict for ParallelTransformerLayerPipe: +[default0]: size mismatch for input_layernorm.weight: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default0]: size mismatch for input_layernorm.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default0]: size mismatch for self_attention.query_key_value.weight: copying a param with shape torch.Size([3072, 2048]) from checkpoint, the shape in current model is torch.Size([2304, 1536]). +[default0]: size mismatch for self_attention.query_key_value.bias: copying a param with shape torch.Size([3072]) from checkpoint, the shape in current model is torch.Size([2304]). +[default0]: size mismatch for self_attention.dense.weight: copying a param with shape torch.Size([2048, 1024]) from checkpoint, the shape in current model is torch.Size([1536, 768]). +[default0]: size mismatch for self_attention.dense.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default0]: size mismatch for post_attention_layernorm.weight: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default0]: size mismatch for post_attention_layernorm.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default0]: size mismatch for mlp.dense_h_to_4h.weight: copying a param with shape torch.Size([4096, 2048]) from checkpoint, the shape in current model is torch.Size([3072, 1536]). +[default0]: size mismatch for mlp.dense_h_to_4h.bias: copying a param with shape torch.Size([4096]) from checkpoint, the shape in current model is torch.Size([3072]). +[default0]: size mismatch for mlp.dense_4h_to_h.weight: copying a param with shape torch.Size([2048, 4096]) from checkpoint, the shape in current model is torch.Size([1536, 3072]). +[default0]: size mismatch for mlp.dense_4h_to_h.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default1]:Traceback (most recent call last): +[default1]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default1]: main() +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default1]: return f(*args, **kwargs) +[default1]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default1]: pretrain( +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default1]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer +[default1]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default1]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default1]: load_path, client_states = self._load_checkpoint(load_dir, +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2640, in _load_checkpoint +[default1]: self.load_module_state_dict(state_dict=checkpoint['module'], +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1340, in load_module_state_dict +[default1]: self.module.load_state_dir(load_dir=self._curr_ckpt_path, +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/module.py", line 604, in load_state_dir +[default1]: layer.load_state_dict(checkpoint) +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1604, in load_state_dict +[default1]: raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format( +[default1]:RuntimeError: Error(s) in loading state_dict for ParallelTransformerLayerPipe: +[default1]: size mismatch for input_layernorm.weight: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default1]: size mismatch for input_layernorm.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default1]: size mismatch for self_attention.query_key_value.weight: copying a param with shape torch.Size([3072, 2048]) from checkpoint, the shape in current model is torch.Size([2304, 1536]). +[default1]: size mismatch for self_attention.query_key_value.bias: copying a param with shape torch.Size([3072]) from checkpoint, the shape in current model is torch.Size([2304]). +[default1]: size mismatch for self_attention.dense.weight: copying a param with shape torch.Size([2048, 1024]) from checkpoint, the shape in current model is torch.Size([1536, 768]). +[default1]: size mismatch for self_attention.dense.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default1]: size mismatch for post_attention_layernorm.weight: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default1]: size mismatch for post_attention_layernorm.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default1]: size mismatch for mlp.dense_h_to_4h.weight: copying a param with shape torch.Size([4096, 2048]) from checkpoint, the shape in current model is torch.Size([3072, 1536]). +[default1]: size mismatch for mlp.dense_h_to_4h.bias: copying a param with shape torch.Size([4096]) from checkpoint, the shape in current model is torch.Size([3072]). +[default1]: size mismatch for mlp.dense_4h_to_h.weight: copying a param with shape torch.Size([2048, 4096]) from checkpoint, the shape in current model is torch.Size([1536, 3072]). +[default1]: size mismatch for mlp.dense_4h_to_h.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default3]:Traceback (most recent call last): +[default3]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default3]: main() +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default3]: return f(*args, **kwargs) +[default3]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default3]: pretrain( +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default3]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer +[default3]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default3]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default3]: load_path, client_states = self._load_checkpoint(load_dir, +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2640, in _load_checkpoint +[default3]: self.load_module_state_dict(state_dict=checkpoint['module'], +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1340, in load_module_state_dict +[default3]: self.module.load_state_dir(load_dir=self._curr_ckpt_path, +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/module.py", line 604, in load_state_dir +[default3]: layer.load_state_dict(checkpoint) +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1604, in load_state_dict +[default3]: raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format( +[default3]:RuntimeError: Error(s) in loading state_dict for ParallelTransformerLayerPipe: +[default3]: size mismatch for input_layernorm.weight: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default3]: size mismatch for input_layernorm.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default3]: size mismatch for self_attention.query_key_value.weight: copying a param with shape torch.Size([3072, 2048]) from checkpoint, the shape in current model is torch.Size([2304, 1536]). +[default3]: size mismatch for self_attention.query_key_value.bias: copying a param with shape torch.Size([3072]) from checkpoint, the shape in current model is torch.Size([2304]). +[default3]: size mismatch for self_attention.dense.weight: copying a param with shape torch.Size([2048, 1024]) from checkpoint, the shape in current model is torch.Size([1536, 768]). +[default3]: size mismatch for self_attention.dense.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default3]: size mismatch for post_attention_layernorm.weight: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default3]: size mismatch for post_attention_layernorm.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default3]: size mismatch for mlp.dense_h_to_4h.weight: copying a param with shape torch.Size([4096, 2048]) from checkpoint, the shape in current model is torch.Size([3072, 1536]). +[default3]: size mismatch for mlp.dense_h_to_4h.bias: copying a param with shape torch.Size([4096]) from checkpoint, the shape in current model is torch.Size([3072]). +[default3]: size mismatch for mlp.dense_4h_to_h.weight: copying a param with shape torch.Size([2048, 4096]) from checkpoint, the shape in current model is torch.Size([1536, 3072]). +[default3]: size mismatch for mlp.dense_4h_to_h.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default7]:[2022-10-06 13:47:54,036] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_01-model_states.pt. +[default5]:[2022-10-06 13:47:54,035] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_01-model_states.pt. +[default6]:[2022-10-06 13:47:54,002] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default6]:[2022-10-06 13:47:54,003] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default5]:[2022-10-06 13:47:54,060] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default5]:[2022-10-06 13:47:54,061] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_01-model_states.pt... +[default7]:[2022-10-06 13:47:54,059] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default7]:[2022-10-06 13:47:54,060] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_01-model_states.pt... +[default7]:[2022-10-06 13:47:54,046] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_01-model_states.pt. +[default5]:Traceback (most recent call last): +[default5]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default5]: main() +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default5]: return f(*args, **kwargs) +[default5]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default5]: pretrain( +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default5]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer +[default5]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default5]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default5]: load_path, client_states = self._load_checkpoint(load_dir, +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2640, in _load_checkpoint +[default5]: self.load_module_state_dict(state_dict=checkpoint['module'], +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1340, in load_module_state_dict +[default5]: self.module.load_state_dir(load_dir=self._curr_ckpt_path, +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/module.py", line 604, in load_state_dir +[default5]: layer.load_state_dict(checkpoint) +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/model/language_model.py", line 235, in load_state_dict +[default5]: self.word_embeddings.load_state_dict(state_dict_, strict=strict) +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1604, in load_state_dict +[default5]: raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format( +[default5]:RuntimeError: Error(s) in loading state_dict for VocabParallelEmbedding: +[default5]: size mismatch for weight: copying a param with shape torch.Size([125440, 2048]) from checkpoint, the shape in current model is torch.Size([125440, 1536]). +[default5]: size mismatch for norm.weight: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default5]: size mismatch for norm.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default4]:[2022-10-06 13:47:54,051] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default4]:Traceback (most recent call last): +[default4]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default4]: main() +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default4]: return f(*args, **kwargs) +[default4]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default4]: pretrain( +[default5]:[2022-10-06 13:47:54,046] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_01-model_states.pt. +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default4]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer +[default4]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default4]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default4]: load_path, client_states = self._load_checkpoint(load_dir, +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2640, in _load_checkpoint +[default4]: self.load_module_state_dict(state_dict=checkpoint['module'], +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1340, in load_module_state_dict +[default4]: self.module.load_state_dir(load_dir=self._curr_ckpt_path, +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/module.py", line 604, in load_state_dir +[default4]: layer.load_state_dict(checkpoint) +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/model/language_model.py", line 235, in load_state_dict +[default4]: self.word_embeddings.load_state_dict(state_dict_, strict=strict) +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1604, in load_state_dict +[default4]: raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format( +[default4]:RuntimeError: Error(s) in loading state_dict for VocabParallelEmbedding: +[default4]: size mismatch for weight: copying a param with shape torch.Size([125440, 2048]) from checkpoint, the shape in current model is torch.Size([125440, 1536]). +[default4]: size mismatch for norm.weight: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default4]: size mismatch for norm.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default7]:Traceback (most recent call last): +[default7]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default7]: main() +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default7]: return f(*args, **kwargs) +[default7]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default7]: pretrain( +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default7]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer +[default7]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default7]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default7]: load_path, client_states = self._load_checkpoint(load_dir, +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2640, in _load_checkpoint +[default7]: self.load_module_state_dict(state_dict=checkpoint['module'], +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1340, in load_module_state_dict +[default7]: self.module.load_state_dir(load_dir=self._curr_ckpt_path, +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/module.py", line 604, in load_state_dir +[default7]: layer.load_state_dict(checkpoint) +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/model/language_model.py", line 235, in load_state_dict +[default7]: self.word_embeddings.load_state_dict(state_dict_, strict=strict) +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1604, in load_state_dict +[default7]: raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format( +[default7]:RuntimeError: Error(s) in loading state_dict for VocabParallelEmbedding: +[default7]: size mismatch for weight: copying a param with shape torch.Size([125440, 2048]) from checkpoint, the shape in current model is torch.Size([125440, 1536]). +[default7]: size mismatch for norm.weight: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default7]: size mismatch for norm.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default5]:Traceback (most recent call last): +[default5]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default5]: main() +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default5]: return f(*args, **kwargs) +[default5]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default5]: pretrain( +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default5]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer +[default5]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default5]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default5]: load_path, client_states = self._load_checkpoint(load_dir, +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2640, in _load_checkpoint +[default5]: self.load_module_state_dict(state_dict=checkpoint['module'], +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1340, in load_module_state_dict +[default5]: self.module.load_state_dir(load_dir=self._curr_ckpt_path, +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/module.py", line 604, in load_state_dir +[default5]: layer.load_state_dict(checkpoint) +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/model/language_model.py", line 235, in load_state_dict +[default5]: self.word_embeddings.load_state_dict(state_dict_, strict=strict) +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1604, in load_state_dict +[default5]: raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format( +[default5]:RuntimeError: Error(s) in loading state_dict for VocabParallelEmbedding: +[default5]: size mismatch for weight: copying a param with shape torch.Size([125440, 2048]) from checkpoint, the shape in current model is torch.Size([125440, 1536]). +[default5]: size mismatch for norm.weight: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default5]: size mismatch for norm.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default7]:Traceback (most recent call last): +[default6]:Traceback (most recent call last): +[default6]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default7]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default7]: main() +[default6]: main() +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default6]: return f(*args, **kwargs) +[default6]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default7]: return f(*args, **kwargs) +[default6]: pretrain( +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default7]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default7]: pretrain( +[default6]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer +[default6]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default7]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer +[default7]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default6]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default6]: load_path, client_states = self._load_checkpoint(load_dir, +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2640, in _load_checkpoint +[default7]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default6]: self.load_module_state_dict(state_dict=checkpoint['module'], +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1340, in load_module_state_dict +[default6]: self.module.load_state_dir(load_dir=self._curr_ckpt_path, +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/module.py", line 604, in load_state_dir +[default7]: load_path, client_states = self._load_checkpoint(load_dir, +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2640, in _load_checkpoint +[default7]: self.load_module_state_dict(state_dict=checkpoint['module'], +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1340, in load_module_state_dict +[default7]: self.module.load_state_dir(load_dir=self._curr_ckpt_path, +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/module.py", line 604, in load_state_dir +[default7]: layer.load_state_dict(checkpoint) +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/model/language_model.py", line 235, in load_state_dict +[default6]: layer.load_state_dict(checkpoint) +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/model/language_model.py", line 235, in load_state_dict +[default7]: self.word_embeddings.load_state_dict(state_dict_, strict=strict) +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1604, in load_state_dict +[default6]: self.word_embeddings.load_state_dict(state_dict_, strict=strict) +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1604, in load_state_dict +[default6]: raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format( +[default7]: raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format( +[default7]:RuntimeError: Error(s) in loading state_dict for VocabParallelEmbedding: +[default7]: size mismatch for weight: copying a param with shape torch.Size([125440, 2048]) from checkpoint, the shape in current model is torch.Size([125440, 1536]). +[default6]:RuntimeError: Error(s) in loading state_dict for VocabParallelEmbedding: +[default7]: size mismatch for norm.weight: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default7]: size mismatch for norm.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default6]: size mismatch for weight: copying a param with shape torch.Size([125440, 2048]) from checkpoint, the shape in current model is torch.Size([125440, 1536]). +[default6]: size mismatch for norm.weight: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default6]: size mismatch for norm.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default6]:[2022-10-06 13:47:54,094] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_02_model_states.pt. +[default6]:[2022-10-06 13:47:54,095] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default6]:[2022-10-06 13:47:54,115] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default6]:[2022-10-06 13:47:54,115] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default6]:[2022-10-06 13:47:54,135] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default4]:[2022-10-06 13:47:54,094] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_02_model_states.pt. +[default4]:[2022-10-06 13:47:54,095] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default4]:[2022-10-06 13:47:54,113] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default4]:[2022-10-06 13:47:54,113] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default4]:[2022-10-06 13:47:54,133] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default4]:Traceback (most recent call last): +[default4]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default4]: main() +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default4]: return f(*args, **kwargs) +[default4]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default4]: pretrain( +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default4]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer +[default4]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default4]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default4]: load_path, client_states = self._load_checkpoint(load_dir, +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2640, in _load_checkpoint +[default4]: self.load_module_state_dict(state_dict=checkpoint['module'], +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1340, in load_module_state_dict +[default4]: self.module.load_state_dir(load_dir=self._curr_ckpt_path, +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/module.py", line 604, in load_state_dir +[default4]: layer.load_state_dict(checkpoint) +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1604, in load_state_dict +[default4]: raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format( +[default4]:RuntimeError: Error(s) in loading state_dict for ParallelTransformerLayerPipe: +[default4]: size mismatch for input_layernorm.weight: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default4]: size mismatch for input_layernorm.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default4]: size mismatch for self_attention.query_key_value.weight: copying a param with shape torch.Size([3072, 2048]) from checkpoint, the shape in current model is torch.Size([2304, 1536]). +[default4]: size mismatch for self_attention.query_key_value.bias: copying a param with shape torch.Size([3072]) from checkpoint, the shape in current model is torch.Size([2304]). +[default4]: size mismatch for self_attention.dense.weight: copying a param with shape torch.Size([2048, 1024]) from checkpoint, the shape in current model is torch.Size([1536, 768]). +[default4]: size mismatch for self_attention.dense.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default4]: size mismatch for post_attention_layernorm.weight: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default4]: size mismatch for post_attention_layernorm.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default4]: size mismatch for mlp.dense_h_to_4h.weight: copying a param with shape torch.Size([4096, 2048]) from checkpoint, the shape in current model is torch.Size([3072, 1536]). +[default4]: size mismatch for mlp.dense_h_to_4h.bias: copying a param with shape torch.Size([4096]) from checkpoint, the shape in current model is torch.Size([3072]). +[default4]: size mismatch for mlp.dense_4h_to_h.weight: copying a param with shape torch.Size([2048, 4096]) from checkpoint, the shape in current model is torch.Size([1536, 3072]). +[default4]: size mismatch for mlp.dense_4h_to_h.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default6]:[2022-10-06 13:47:54,093] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default4]:[2022-10-06 13:47:54,084] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default4]:[2022-10-06 13:47:54,085] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default6]:Traceback (most recent call last): +[default6]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default6]: main() +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default6]: return f(*args, **kwargs) +[default6]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default6]: pretrain( +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default6]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer +[default6]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default6]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default6]: load_path, client_states = self._load_checkpoint(load_dir, +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2640, in _load_checkpoint +[default6]: self.load_module_state_dict(state_dict=checkpoint['module'], +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1340, in load_module_state_dict +[default6]: self.module.load_state_dir(load_dir=self._curr_ckpt_path, +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/module.py", line 604, in load_state_dir +[default6]: layer.load_state_dict(checkpoint) +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1604, in load_state_dict +[default6]: raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format( +[default6]:RuntimeError: Error(s) in loading state_dict for ParallelTransformerLayerPipe: +[default6]: size mismatch for input_layernorm.weight: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default6]: size mismatch for input_layernorm.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default6]: size mismatch for self_attention.query_key_value.weight: copying a param with shape torch.Size([3072, 2048]) from checkpoint, the shape in current model is torch.Size([2304, 1536]). +[default6]: size mismatch for self_attention.query_key_value.bias: copying a param with shape torch.Size([3072]) from checkpoint, the shape in current model is torch.Size([2304]). +[default6]: size mismatch for self_attention.dense.weight: copying a param with shape torch.Size([2048, 1024]) from checkpoint, the shape in current model is torch.Size([1536, 768]). +[default6]: size mismatch for self_attention.dense.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default6]: size mismatch for post_attention_layernorm.weight: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default6]: size mismatch for post_attention_layernorm.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default6]: size mismatch for mlp.dense_h_to_4h.weight: copying a param with shape torch.Size([4096, 2048]) from checkpoint, the shape in current model is torch.Size([3072, 1536]). +[default6]: size mismatch for mlp.dense_h_to_4h.bias: copying a param with shape torch.Size([4096]) from checkpoint, the shape in current model is torch.Size([3072]). +[default6]: size mismatch for mlp.dense_4h_to_h.weight: copying a param with shape torch.Size([2048, 4096]) from checkpoint, the shape in current model is torch.Size([1536, 3072]). +[default6]: size mismatch for mlp.dense_4h_to_h.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default7]:Traceback (most recent call last): +[default5]:Traceback (most recent call last): +[default7]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default5]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default5]: main() +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default5]: return f(*args, **kwargs) +[default7]: main() +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default5]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default5]: pretrain( +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default5]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer +[default7]: return f(*args, **kwargs) +[default7]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default7]: pretrain( +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default7]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer +[default7]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default5]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default7]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default7]: load_path, client_states = self._load_checkpoint(load_dir, +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2640, in _load_checkpoint +[default5]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default7]: self.load_module_state_dict(state_dict=checkpoint['module'], +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1340, in load_module_state_dict +[default7]: self.module.load_state_dir(load_dir=self._curr_ckpt_path, +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/module.py", line 604, in load_state_dir +[default7]: layer.load_state_dict(checkpoint) +[default5]: load_path, client_states = self._load_checkpoint(load_dir, +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2640, in _load_checkpoint +[default5]: self.load_module_state_dict(state_dict=checkpoint['module'], +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/model/language_model.py", line 235, in load_state_dict +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1340, in load_module_state_dict +[default7]: self.word_embeddings.load_state_dict(state_dict_, strict=strict) +[default5]: self.module.load_state_dir(load_dir=self._curr_ckpt_path, +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/module.py", line 604, in load_state_dir +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1604, in load_state_dict +[default7]: raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format( +[default5]: layer.load_state_dict(checkpoint) +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/model/language_model.py", line 235, in load_state_dict +[default7]:RuntimeError: Error(s) in loading state_dict for VocabParallelEmbedding: +[default7]: size mismatch for weight: copying a param with shape torch.Size([125440, 2048]) from checkpoint, the shape in current model is torch.Size([125440, 1536]). +[default5]: self.word_embeddings.load_state_dict(state_dict_, strict=strict) +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1604, in load_state_dict +[default5]: raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format( +[default5]:RuntimeError: Error(s) in loading state_dict for VocabParallelEmbedding: +[default5]: size mismatch for weight: copying a param with shape torch.Size([125440, 2048]) from checkpoint, the shape in current model is torch.Size([125440, 1536]). +[default7]: size mismatch for norm.weight: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default7]: size mismatch for norm.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default5]: size mismatch for norm.weight: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default5]: size mismatch for norm.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default0]:[2022-10-06 13:47:54,239] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_02_model_states.pt. +[default0]:[2022-10-06 13:47:54,239] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default2]:[2022-10-06 13:47:54,242] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_02_model_states.pt. +[default2]:[2022-10-06 13:47:54,242] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default7]:[2022-10-06 13:47:54,253] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_03_model_states.pt. +[default7]:[2022-10-06 13:47:54,254] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default5]:[2022-10-06 13:47:54,176] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_01-model_states.pt. +[default4]:[2022-10-06 13:47:54,184] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default4]:[2022-10-06 13:47:54,185] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default4]:[2022-10-06 13:47:54,267] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default4]:Traceback (most recent call last): +[default4]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default4]: main() +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default4]: return f(*args, **kwargs) +[default4]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default4]: pretrain( +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default4]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer +[default4]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default4]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default4]: load_path, client_states = self._load_checkpoint(load_dir, +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2640, in _load_checkpoint +[default4]: self.load_module_state_dict(state_dict=checkpoint['module'], +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1340, in load_module_state_dict +[default4]: self.module.load_state_dir(load_dir=self._curr_ckpt_path, +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/module.py", line 604, in load_state_dir +[default4]: layer.load_state_dict(checkpoint) +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/model/language_model.py", line 235, in load_state_dict +[default4]: self.word_embeddings.load_state_dict(state_dict_, strict=strict) +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1604, in load_state_dict +[default4]: raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format( +[default4]:RuntimeError: Error(s) in loading state_dict for VocabParallelEmbedding: +[default4]: size mismatch for weight: copying a param with shape torch.Size([125440, 2048]) from checkpoint, the shape in current model is torch.Size([125440, 1536]). +[default4]: size mismatch for norm.weight: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default4]: size mismatch for norm.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default7]:[2022-10-06 13:47:54,175] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_01-model_states.pt. +[default0]:[2022-10-06 13:47:54,275] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default0]:[2022-10-06 13:47:54,275] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default0]:[2022-10-06 13:47:54,300] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default2]:Traceback (most recent call last): +[default2]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default2]: main() +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default2]: return f(*args, **kwargs) +[default2]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default2]: pretrain( +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default2]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer +[default2]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default2]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default2]: load_path, client_states = self._load_checkpoint(load_dir, +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2640, in _load_checkpoint +[default2]: self.load_module_state_dict(state_dict=checkpoint['module'], +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1340, in load_module_state_dict +[default2]: self.module.load_state_dir(load_dir=self._curr_ckpt_path, +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/module.py", line 604, in load_state_dir +[default2]: layer.load_state_dict(checkpoint) +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1604, in load_state_dict +[default2]: raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format( +[default2]:RuntimeError: Error(s) in loading state_dict for ParallelTransformerLayerPipe: +[default2]: size mismatch for input_layernorm.weight: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default2]: size mismatch for input_layernorm.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default2]: size mismatch for self_attention.query_key_value.weight: copying a param with shape torch.Size([3072, 2048]) from checkpoint, the shape in current model is torch.Size([2304, 1536]). +[default2]: size mismatch for self_attention.query_key_value.bias: copying a param with shape torch.Size([3072]) from checkpoint, the shape in current model is torch.Size([2304]). +[default2]: size mismatch for self_attention.dense.weight: copying a param with shape torch.Size([2048, 1024]) from checkpoint, the shape in current model is torch.Size([1536, 768]). +[default2]: size mismatch for self_attention.dense.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default2]: size mismatch for post_attention_layernorm.weight: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default2]: size mismatch for post_attention_layernorm.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default2]: size mismatch for mlp.dense_h_to_4h.weight: copying a param with shape torch.Size([4096, 2048]) from checkpoint, the shape in current model is torch.Size([3072, 1536]). +[default2]: size mismatch for mlp.dense_h_to_4h.bias: copying a param with shape torch.Size([4096]) from checkpoint, the shape in current model is torch.Size([3072]). +[default2]: size mismatch for mlp.dense_4h_to_h.weight: copying a param with shape torch.Size([2048, 4096]) from checkpoint, the shape in current model is torch.Size([1536, 3072]). +[default2]: size mismatch for mlp.dense_4h_to_h.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default3]:[2022-10-06 13:47:54,304] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_03_model_states.pt. +[default3]:[2022-10-06 13:47:54,305] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default3]:[2022-10-06 13:47:54,331] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default3]:[2022-10-06 13:47:54,331] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_01-model_states.pt... +[default1]:[2022-10-06 13:47:54,303] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_03_model_states.pt. +[default1]:[2022-10-06 13:47:54,304] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default1]:[2022-10-06 13:47:54,330] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default1]:[2022-10-06 13:47:54,331] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_01-model_states.pt... +[default2]:[2022-10-06 13:47:54,275] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default2]:[2022-10-06 13:47:54,275] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default2]:[2022-10-06 13:47:54,299] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default7]:[2022-10-06 13:47:54,276] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default7]:[2022-10-06 13:47:54,276] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_01-model_states.pt... +[default7]:[2022-10-06 13:47:54,294] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_01-model_states.pt. +[default0]:Traceback (most recent call last): +[default0]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default0]: main() +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default0]: return f(*args, **kwargs) +[default0]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default0]: pretrain( +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default0]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer +[default0]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default0]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default0]: load_path, client_states = self._load_checkpoint(load_dir, +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2640, in _load_checkpoint +[default0]: self.load_module_state_dict(state_dict=checkpoint['module'], +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1340, in load_module_state_dict +[default0]: self.module.load_state_dir(load_dir=self._curr_ckpt_path, +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/module.py", line 604, in load_state_dir +[default0]: layer.load_state_dict(checkpoint) +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1604, in load_state_dict +[default0]: raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format( +[default0]:RuntimeError: Error(s) in loading state_dict for ParallelTransformerLayerPipe: +[default0]: size mismatch for input_layernorm.weight: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default0]: size mismatch for input_layernorm.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default0]: size mismatch for self_attention.query_key_value.weight: copying a param with shape torch.Size([3072, 2048]) from checkpoint, the shape in current model is torch.Size([2304, 1536]). +[default0]: size mismatch for self_attention.query_key_value.bias: copying a param with shape torch.Size([3072]) from checkpoint, the shape in current model is torch.Size([2304]). +[default0]: size mismatch for self_attention.dense.weight: copying a param with shape torch.Size([2048, 1024]) from checkpoint, the shape in current model is torch.Size([1536, 768]). +[default0]: size mismatch for self_attention.dense.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default0]: size mismatch for post_attention_layernorm.weight: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default0]: size mismatch for post_attention_layernorm.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default0]: size mismatch for mlp.dense_h_to_4h.weight: copying a param with shape torch.Size([4096, 2048]) from checkpoint, the shape in current model is torch.Size([3072, 1536]). +[default0]: size mismatch for mlp.dense_h_to_4h.bias: copying a param with shape torch.Size([4096]) from checkpoint, the shape in current model is torch.Size([3072]). +[default0]: size mismatch for mlp.dense_4h_to_h.weight: copying a param with shape torch.Size([2048, 4096]) from checkpoint, the shape in current model is torch.Size([1536, 3072]). +[default0]: size mismatch for mlp.dense_4h_to_h.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default7]:Traceback (most recent call last): +[default7]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default7]: main() +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default7]: return f(*args, **kwargs) +[default7]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default7]: pretrain( +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default7]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer +[default7]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default7]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default7]: load_path, client_states = self._load_checkpoint(load_dir, +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2640, in _load_checkpoint +[default7]: self.load_module_state_dict(state_dict=checkpoint['module'], +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1340, in load_module_state_dict +[default7]: self.module.load_state_dir(load_dir=self._curr_ckpt_path, +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/module.py", line 604, in load_state_dir +[default7]: layer.load_state_dict(checkpoint) +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1604, in load_state_dict +[default7]: raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format( +[default7]:RuntimeError: Error(s) in loading state_dict for ParallelTransformerLayerPipe: +[default7]: size mismatch for input_layernorm.weight: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default7]: size mismatch for input_layernorm.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default7]: size mismatch for self_attention.query_key_value.weight: copying a param with shape torch.Size([3072, 2048]) from checkpoint, the shape in current model is torch.Size([2304, 1536]). +[default7]: size mismatch for self_attention.query_key_value.bias: copying a param with shape torch.Size([3072]) from checkpoint, the shape in current model is torch.Size([2304]). +[default7]: size mismatch for self_attention.dense.weight: copying a param with shape torch.Size([2048, 1024]) from checkpoint, the shape in current model is torch.Size([1536, 768]). +[default7]: size mismatch for self_attention.dense.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default7]: size mismatch for post_attention_layernorm.weight: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default7]: size mismatch for post_attention_layernorm.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default7]: size mismatch for mlp.dense_h_to_4h.weight: copying a param with shape torch.Size([4096, 2048]) from checkpoint, the shape in current model is torch.Size([3072, 1536]). +[default7]: size mismatch for mlp.dense_h_to_4h.bias: copying a param with shape torch.Size([4096]) from checkpoint, the shape in current model is torch.Size([3072]). +[default7]: size mismatch for mlp.dense_4h_to_h.weight: copying a param with shape torch.Size([2048, 4096]) from checkpoint, the shape in current model is torch.Size([1536, 3072]). +[default7]: size mismatch for mlp.dense_4h_to_h.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default3]:Traceback (most recent call last): +[default3]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default3]: main() +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default3]: return f(*args, **kwargs) +[default3]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default3]: pretrain( +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default3]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer +[default3]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default3]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default3]: load_path, client_states = self._load_checkpoint(load_dir, +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2640, in _load_checkpoint +[default3]: self.load_module_state_dict(state_dict=checkpoint['module'], +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1340, in load_module_state_dict +[default3]: self.module.load_state_dir(load_dir=self._curr_ckpt_path, +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/module.py", line 604, in load_state_dir +[default3]: layer.load_state_dict(checkpoint) +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1604, in load_state_dict +[default3]: raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format( +[default3]:RuntimeError: Error(s) in loading state_dict for ParallelTransformerLayerPipe: +[default3]: size mismatch for input_layernorm.weight: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default3]: size mismatch for input_layernorm.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default3]: size mismatch for self_attention.query_key_value.weight: copying a param with shape torch.Size([3072, 2048]) from checkpoint, the shape in current model is torch.Size([2304, 1536]). +[default3]: size mismatch for self_attention.query_key_value.bias: copying a param with shape torch.Size([3072]) from checkpoint, the shape in current model is torch.Size([2304]). +[default3]: size mismatch for self_attention.dense.weight: copying a param with shape torch.Size([2048, 1024]) from checkpoint, the shape in current model is torch.Size([1536, 768]). +[default3]: size mismatch for self_attention.dense.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default3]: size mismatch for post_attention_layernorm.weight: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default3]: size mismatch for post_attention_layernorm.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default3]: size mismatch for mlp.dense_h_to_4h.weight: copying a param with shape torch.Size([4096, 2048]) from checkpoint, the shape in current model is torch.Size([3072, 1536]). +[default3]: size mismatch for mlp.dense_h_to_4h.bias: copying a param with shape torch.Size([4096]) from checkpoint, the shape in current model is torch.Size([3072]). +[default3]: size mismatch for mlp.dense_4h_to_h.weight: copying a param with shape torch.Size([2048, 4096]) from checkpoint, the shape in current model is torch.Size([1536, 3072]). +[default3]: size mismatch for mlp.dense_4h_to_h.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default3]:[2022-10-06 13:47:54,359] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_01-model_states.pt. +[default1]:[2022-10-06 13:47:54,358] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_01-model_states.pt. +[default1]:Traceback (most recent call last): +[default1]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default1]: main() +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default1]: return f(*args, **kwargs) +[default1]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default1]: pretrain( +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default1]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer +[default1]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default1]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default1]: load_path, client_states = self._load_checkpoint(load_dir, +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2640, in _load_checkpoint +[default1]: self.load_module_state_dict(state_dict=checkpoint['module'], +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1340, in load_module_state_dict +[default1]: self.module.load_state_dir(load_dir=self._curr_ckpt_path, +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/module.py", line 604, in load_state_dir +[default1]: layer.load_state_dict(checkpoint) +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1604, in load_state_dict +[default1]: raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format( +[default1]:RuntimeError: Error(s) in loading state_dict for ParallelTransformerLayerPipe: +[default1]: size mismatch for input_layernorm.weight: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default1]: size mismatch for input_layernorm.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default1]: size mismatch for self_attention.query_key_value.weight: copying a param with shape torch.Size([3072, 2048]) from checkpoint, the shape in current model is torch.Size([2304, 1536]). +[default1]: size mismatch for self_attention.query_key_value.bias: copying a param with shape torch.Size([3072]) from checkpoint, the shape in current model is torch.Size([2304]). +[default1]: size mismatch for self_attention.dense.weight: copying a param with shape torch.Size([2048, 1024]) from checkpoint, the shape in current model is torch.Size([1536, 768]). +[default1]: size mismatch for self_attention.dense.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default1]: size mismatch for post_attention_layernorm.weight: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default1]: size mismatch for post_attention_layernorm.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default1]: size mismatch for mlp.dense_h_to_4h.weight: copying a param with shape torch.Size([4096, 2048]) from checkpoint, the shape in current model is torch.Size([3072, 1536]). +[default1]: size mismatch for mlp.dense_h_to_4h.bias: copying a param with shape torch.Size([4096]) from checkpoint, the shape in current model is torch.Size([3072]). +[default1]: size mismatch for mlp.dense_4h_to_h.weight: copying a param with shape torch.Size([2048, 4096]) from checkpoint, the shape in current model is torch.Size([1536, 3072]). +[default1]: size mismatch for mlp.dense_4h_to_h.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default3]:[2022-10-06 13:47:54,547] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_01_model_states.pt. +[default3]:[2022-10-06 13:47:54,548] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default7]:[2022-10-06 13:47:54,487] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_01_model_states.pt. +[default7]:[2022-10-06 13:47:54,488] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default5]:[2022-10-06 13:47:54,487] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_01_model_states.pt. +[default5]:[2022-10-06 13:47:54,487] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default0]:[2022-10-06 13:47:54,610] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default0]:[2022-10-06 13:47:54,610] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default4]:[2022-10-06 13:47:54,595] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default4]:[2022-10-06 13:47:54,595] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default2]:[2022-10-06 13:47:54,605] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default2]:[2022-10-06 13:47:54,605] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default6]:[2022-10-06 13:47:54,605] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default6]:[2022-10-06 13:47:54,606] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default0]:Traceback (most recent call last): +[default0]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default0]: main() +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default0]: return f(*args, **kwargs) +[default0]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default0]: pretrain( +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default0]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer +[default0]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default0]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default0]: load_path, client_states = self._load_checkpoint(load_dir, +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2640, in _load_checkpoint +[default0]: self.load_module_state_dict(state_dict=checkpoint['module'], +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1340, in load_module_state_dict +[default0]: self.module.load_state_dir(load_dir=self._curr_ckpt_path, +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/module.py", line 604, in load_state_dir +[default0]: layer.load_state_dict(checkpoint) +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1604, in load_state_dict +[default0]: raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format( +[default0]:RuntimeError: Error(s) in loading state_dict for ParallelTransformerLayerPipe: +[default0]: size mismatch for input_layernorm.weight: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default0]: size mismatch for input_layernorm.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default0]: size mismatch for self_attention.query_key_value.weight: copying a param with shape torch.Size([3072, 2048]) from checkpoint, the shape in current model is torch.Size([2304, 1536]). +[default0]: size mismatch for self_attention.query_key_value.bias: copying a param with shape torch.Size([3072]) from checkpoint, the shape in current model is torch.Size([2304]). +[default0]: size mismatch for self_attention.dense.weight: copying a param with shape torch.Size([2048, 1024]) from checkpoint, the shape in current model is torch.Size([1536, 768]). +[default0]: size mismatch for self_attention.dense.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default0]: size mismatch for post_attention_layernorm.weight: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default0]: size mismatch for post_attention_layernorm.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default0]: size mismatch for mlp.dense_h_to_4h.weight: copying a param with shape torch.Size([4096, 2048]) from checkpoint, the shape in current model is torch.Size([3072, 1536]). +[default0]: size mismatch for mlp.dense_h_to_4h.bias: copying a param with shape torch.Size([4096]) from checkpoint, the shape in current model is torch.Size([3072]). +[default0]: size mismatch for mlp.dense_4h_to_h.weight: copying a param with shape torch.Size([2048, 4096]) from checkpoint, the shape in current model is torch.Size([1536, 3072]). +[default0]: size mismatch for mlp.dense_4h_to_h.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default2]:[2022-10-06 13:47:54,678] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_02_model_states.pt. +[default2]:[2022-10-06 13:47:54,680] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default2]:[2022-10-06 13:47:54,724] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default2]:[2022-10-06 13:47:54,725] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default2]:[2022-10-06 13:47:54,752] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default3]:[2022-10-06 13:47:54,716] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_03_model_states.pt. +[default3]:[2022-10-06 13:47:54,717] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default3]:[2022-10-06 13:47:54,735] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default3]:[2022-10-06 13:47:54,736] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_01-model_states.pt... +[default7]:[2022-10-06 13:47:54,753] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_03_model_states.pt. +[default7]:[2022-10-06 13:47:54,754] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default0]:[2022-10-06 13:47:54,678] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_02_model_states.pt. +[default1]:[2022-10-06 13:47:54,721] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_03_model_states.pt. +[default0]:[2022-10-06 13:47:54,678] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default0]:[2022-10-06 13:47:54,721] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default0]:[2022-10-06 13:47:54,721] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default0]:[2022-10-06 13:47:54,744] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default1]:[2022-10-06 13:47:54,721] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default1]:[2022-10-06 13:47:54,755] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default1]:[2022-10-06 13:47:54,756] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_01-model_states.pt... +[default4]:[2022-10-06 13:47:54,721] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_02_model_states.pt. +[default4]:[2022-10-06 13:47:54,721] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default4]:[2022-10-06 13:47:54,760] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default4]:[2022-10-06 13:47:54,760] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default3]:[2022-10-06 13:47:54,760] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default3]:[2022-10-06 13:47:54,761] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_01-model_states.pt... +[default1]:[2022-10-06 13:47:54,690] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_01_model_states.pt. +[default1]:[2022-10-06 13:47:54,690] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default4]:[2022-10-06 13:47:54,760] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default2]:[2022-10-06 13:47:54,762] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default2]:[2022-10-06 13:47:54,764] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default4]:[2022-10-06 13:47:54,761] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default5]:[2022-10-06 13:47:54,692] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default7]:[2022-10-06 13:47:54,692] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default7]:[2022-10-06 13:47:54,693] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_01-model_states.pt... +[default5]:[2022-10-06 13:47:54,693] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_01-model_states.pt... +[default6]:[2022-10-06 13:47:54,762] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default6]:[2022-10-06 13:47:54,764] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default4]:Traceback (most recent call last): +[default4]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default4]: main() +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default4]: return f(*args, **kwargs) +[default4]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default4]: pretrain( +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default4]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer +[default4]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default4]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default4]: load_path, client_states = self._load_checkpoint(load_dir, +[default5]:[2022-10-06 13:47:54,797] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_03_model_states.pt. +[default1]:Traceback (most recent call last): +[default1]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default1]: main() +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default1]: return f(*args, **kwargs) +[default1]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default1]: pretrain( +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default1]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer +[default1]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default1]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default1]: load_path, client_states = self._load_checkpoint(load_dir, +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2640, in _load_checkpoint +[default1]: self.load_module_state_dict(state_dict=checkpoint['module'], +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1340, in load_module_state_dict +[default1]: self.module.load_state_dir(load_dir=self._curr_ckpt_path, +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/module.py", line 604, in load_state_dir +[default1]: layer.load_state_dict(checkpoint) +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1604, in load_state_dict +[default1]: raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format( +[default1]:RuntimeError: Error(s) in loading state_dict for ParallelTransformerLayerPipe: +[default1]: size mismatch for input_layernorm.weight: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default1]: size mismatch for input_layernorm.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default1]: size mismatch for self_attention.query_key_value.weight: copying a param with shape torch.Size([3072, 2048]) from checkpoint, the shape in current model is torch.Size([2304, 1536]). +[default1]: size mismatch for self_attention.query_key_value.bias: copying a param with shape torch.Size([3072]) from checkpoint, the shape in current model is torch.Size([2304]). +[default1]: size mismatch for self_attention.dense.weight: copying a param with shape torch.Size([2048, 1024]) from checkpoint, the shape in current model is torch.Size([1536, 768]). +[default1]: size mismatch for self_attention.dense.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default1]: size mismatch for post_attention_layernorm.weight: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default5]:[2022-10-06 13:47:54,798] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2640, in _load_checkpoint +[default4]: self.load_module_state_dict(state_dict=checkpoint['module'], +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1340, in load_module_state_dict +[default4]: self.module.load_state_dir(load_dir=self._curr_ckpt_path, +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/module.py", line 604, in load_state_dir +[default4]: layer.load_state_dict(checkpoint) +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1604, in load_state_dict +[default4]: raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format( +[default5]:[2022-10-06 13:47:54,827] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default5]:[2022-10-06 13:47:54,827] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_01-model_states.pt... +[default5]:[2022-10-06 13:47:54,846] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_01-model_states.pt. +[default1]: size mismatch for post_attention_layernorm.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default1]: size mismatch for mlp.dense_h_to_4h.weight: copying a param with shape torch.Size([4096, 2048]) from checkpoint, the shape in current model is torch.Size([3072, 1536]). +[default1]: size mismatch for mlp.dense_h_to_4h.bias: copying a param with shape torch.Size([4096]) from checkpoint, the shape in current model is torch.Size([3072]). +[default1]: size mismatch for mlp.dense_4h_to_h.weight: copying a param with shape torch.Size([2048, 4096]) from checkpoint, the shape in current model is torch.Size([1536, 3072]). +[default1]: size mismatch for mlp.dense_4h_to_h.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default7]:[2022-10-06 13:47:54,784] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default2]:Traceback (most recent call last): +[default2]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default2]: main() +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default3]:[2022-10-06 13:47:54,781] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_01-model_states.pt. +[default7]:[2022-10-06 13:47:54,784] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_01-model_states.pt... +[default7]:[2022-10-06 13:47:54,810] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_01-model_states.pt. +[default6]:[2022-10-06 13:47:54,761] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_02_model_states.pt. +[default6]:[2022-10-06 13:47:54,762] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default6]:[2022-10-06 13:47:54,799] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default6]:[2022-10-06 13:47:54,799] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default6]:[2022-10-06 13:47:54,823] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default4]:RuntimeError: Error(s) in loading state_dict for ParallelTransformerLayerPipe: +[default4]: size mismatch for input_layernorm.weight: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default4]: size mismatch for input_layernorm.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default4]: size mismatch for self_attention.query_key_value.weight: copying a param with shape torch.Size([3072, 2048]) from checkpoint, the shape in current model is torch.Size([2304, 1536]). +[default2]: return f(*args, **kwargs) +[default2]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default2]: pretrain( +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default2]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer +[default2]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default4]: size mismatch for self_attention.query_key_value.bias: copying a param with shape torch.Size([3072]) from checkpoint, the shape in current model is torch.Size([2304]). +[default4]: size mismatch for self_attention.dense.weight: copying a param with shape torch.Size([2048, 1024]) from checkpoint, the shape in current model is torch.Size([1536, 768]). +[default4]: size mismatch for self_attention.dense.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default2]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default4]: size mismatch for post_attention_layernorm.weight: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default4]: size mismatch for post_attention_layernorm.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default4]: size mismatch for mlp.dense_h_to_4h.weight: copying a param with shape torch.Size([4096, 2048]) from checkpoint, the shape in current model is torch.Size([3072, 1536]). +[default2]: load_path, client_states = self._load_checkpoint(load_dir, +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2640, in _load_checkpoint +[default2]: self.load_module_state_dict(state_dict=checkpoint['module'], +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1340, in load_module_state_dict +[default2]: self.module.load_state_dir(load_dir=self._curr_ckpt_path, +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/module.py", line 604, in load_state_dir +[default4]: size mismatch for mlp.dense_h_to_4h.bias: copying a param with shape torch.Size([4096]) from checkpoint, the shape in current model is torch.Size([3072]). +[default4]: size mismatch for mlp.dense_4h_to_h.weight: copying a param with shape torch.Size([2048, 4096]) from checkpoint, the shape in current model is torch.Size([1536, 3072]). +[default4]: size mismatch for mlp.dense_4h_to_h.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default2]: layer.load_state_dict(checkpoint) +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1604, in load_state_dict +[default2]: raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format( +[default2]:RuntimeError: Error(s) in loading state_dict for ParallelTransformerLayerPipe: +[default2]: size mismatch for input_layernorm.weight: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default2]: size mismatch for input_layernorm.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default2]: size mismatch for self_attention.query_key_value.weight: copying a param with shape torch.Size([3072, 2048]) from checkpoint, the shape in current model is torch.Size([2304, 1536]). +[default2]: size mismatch for self_attention.query_key_value.bias: copying a param with shape torch.Size([3072]) from checkpoint, the shape in current model is torch.Size([2304]). +[default2]: size mismatch for self_attention.dense.weight: copying a param with shape torch.Size([2048, 1024]) from checkpoint, the shape in current model is torch.Size([1536, 768]). +[default2]: size mismatch for self_attention.dense.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default2]: size mismatch for post_attention_layernorm.weight: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default2]: size mismatch for post_attention_layernorm.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default2]: size mismatch for mlp.dense_h_to_4h.weight: copying a param with shape torch.Size([4096, 2048]) from checkpoint, the shape in current model is torch.Size([3072, 1536]). +[default2]: size mismatch for mlp.dense_h_to_4h.bias: copying a param with shape torch.Size([4096]) from checkpoint, the shape in current model is torch.Size([3072]). +[default2]: size mismatch for mlp.dense_4h_to_h.weight: copying a param with shape torch.Size([2048, 4096]) from checkpoint, the shape in current model is torch.Size([1536, 3072]). +[default2]: size mismatch for mlp.dense_4h_to_h.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default1]:[2022-10-06 13:47:54,795] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_01-model_states.pt. +[default3]:Traceback (most recent call last): +[default3]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default3]: main() +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default3]: return f(*args, **kwargs) +[default3]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default3]: pretrain( +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default3]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer +[default3]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default3]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default3]: load_path, client_states = self._load_checkpoint(load_dir, +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2640, in _load_checkpoint +[default3]: self.load_module_state_dict(state_dict=checkpoint['module'], +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1340, in load_module_state_dict +[default3]: self.module.load_state_dir(load_dir=self._curr_ckpt_path, +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/module.py", line 604, in load_state_dir +[default3]: layer.load_state_dict(checkpoint) +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1604, in load_state_dict +[default3]: raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format( +[default3]:RuntimeError: Error(s) in loading state_dict for ParallelTransformerLayerPipe: +[default3]: size mismatch for input_layernorm.weight: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default3]: size mismatch for input_layernorm.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default3]: size mismatch for self_attention.query_key_value.weight: copying a param with shape torch.Size([3072, 2048]) from checkpoint, the shape in current model is torch.Size([2304, 1536]). +[default3]: size mismatch for self_attention.query_key_value.bias: copying a param with shape torch.Size([3072]) from checkpoint, the shape in current model is torch.Size([2304]). +[default3]: size mismatch for self_attention.dense.weight: copying a param with shape torch.Size([2048, 1024]) from checkpoint, the shape in current model is torch.Size([1536, 768]). +[default3]: size mismatch for self_attention.dense.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default3]: size mismatch for post_attention_layernorm.weight: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default3]: size mismatch for post_attention_layernorm.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default3]: size mismatch for mlp.dense_h_to_4h.weight: copying a param with shape torch.Size([4096, 2048]) from checkpoint, the shape in current model is torch.Size([3072, 1536]). +[default3]: size mismatch for mlp.dense_h_to_4h.bias: copying a param with shape torch.Size([4096]) from checkpoint, the shape in current model is torch.Size([3072]). +[default3]: size mismatch for mlp.dense_4h_to_h.weight: copying a param with shape torch.Size([2048, 4096]) from checkpoint, the shape in current model is torch.Size([1536, 3072]). +[default3]: size mismatch for mlp.dense_4h_to_h.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default4]:[2022-10-06 13:47:54,790] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default6]:Traceback (most recent call last): +[default7]:Traceback (most recent call last): +[default7]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default6]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default6]: main() +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default6]: return f(*args, **kwargs) +[default6]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default6]: pretrain( +[default7]: main() +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default5]:Traceback (most recent call last): +[default5]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default5]: main() +[default7]: return f(*args, **kwargs) +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default6]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer +[default7]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default7]: pretrain( +[default5]: return f(*args, **kwargs) +[default5]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default5]: pretrain( +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default6]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default7]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default5]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default6]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default6]: load_path, client_states = self._load_checkpoint(load_dir, +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2640, in _load_checkpoint +[default6]: self.load_module_state_dict(state_dict=checkpoint['module'], +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1340, in load_module_state_dict +[default6]: self.module.load_state_dir(load_dir=self._curr_ckpt_path, +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/module.py", line 604, in load_state_dir +[default6]: layer.load_state_dict(checkpoint) +[default5]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default5]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer +[default7]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1604, in load_state_dict +[default6]: raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format( +[default6]:RuntimeError: Error(s) in loading state_dict for ParallelTransformerLayerPipe: +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default5]: load_path, client_states = self._load_checkpoint(load_dir, +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2640, in _load_checkpoint +[default7]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default7]: load_path, client_states = self._load_checkpoint(load_dir, +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2640, in _load_checkpoint +[default5]: self.load_module_state_dict(state_dict=checkpoint['module'], +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1340, in load_module_state_dict +[default5]: self.module.load_state_dir(load_dir=self._curr_ckpt_path, +[default6]: size mismatch for input_layernorm.weight: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default7]: self.load_module_state_dict(state_dict=checkpoint['module'], +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1340, in load_module_state_dict +[default6]: size mismatch for input_layernorm.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default6]: size mismatch for self_attention.query_key_value.weight: copying a param with shape torch.Size([3072, 2048]) from checkpoint, the shape in current model is torch.Size([2304, 1536]). +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/module.py", line 604, in load_state_dir +[default5]: layer.load_state_dict(checkpoint) +[default6]: size mismatch for self_attention.query_key_value.bias: copying a param with shape torch.Size([3072]) from checkpoint, the shape in current model is torch.Size([2304]). +[default6]: size mismatch for self_attention.dense.weight: copying a param with shape torch.Size([2048, 1024]) from checkpoint, the shape in current model is torch.Size([1536, 768]). +[default6]: size mismatch for self_attention.dense.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1604, in load_state_dict +[default5]: raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format( +[default5]:RuntimeError: Error(s) in loading state_dict for ParallelTransformerLayerPipe: +[default7]: self.module.load_state_dir(load_dir=self._curr_ckpt_path, +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/module.py", line 604, in load_state_dir +[default6]: size mismatch for post_attention_layernorm.weight: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default5]: size mismatch for input_layernorm.weight: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default5]: size mismatch for input_layernorm.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default6]: size mismatch for post_attention_layernorm.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default6]: size mismatch for mlp.dense_h_to_4h.weight: copying a param with shape torch.Size([4096, 2048]) from checkpoint, the shape in current model is torch.Size([3072, 1536]). +[default7]: layer.load_state_dict(checkpoint) +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1604, in load_state_dict +[default5]: size mismatch for self_attention.query_key_value.weight: copying a param with shape torch.Size([3072, 2048]) from checkpoint, the shape in current model is torch.Size([2304, 1536]). +[default5]: size mismatch for self_attention.query_key_value.bias: copying a param with shape torch.Size([3072]) from checkpoint, the shape in current model is torch.Size([2304]). +[default6]: size mismatch for mlp.dense_h_to_4h.bias: copying a param with shape torch.Size([4096]) from checkpoint, the shape in current model is torch.Size([3072]). +[default6]: size mismatch for mlp.dense_4h_to_h.weight: copying a param with shape torch.Size([2048, 4096]) from checkpoint, the shape in current model is torch.Size([1536, 3072]). +[default7]: raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format( +[default7]:RuntimeError: Error(s) in loading state_dict for ParallelTransformerLayerPipe: +[default7]: size mismatch for input_layernorm.weight: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default7]: size mismatch for input_layernorm.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default6]: size mismatch for mlp.dense_4h_to_h.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default7]: size mismatch for self_attention.query_key_value.weight: copying a param with shape torch.Size([3072, 2048]) from checkpoint, the shape in current model is torch.Size([2304, 1536]). +[default7]: size mismatch for self_attention.query_key_value.bias: copying a param with shape torch.Size([3072]) from checkpoint, the shape in current model is torch.Size([2304]). +[default7]: size mismatch for self_attention.dense.weight: copying a param with shape torch.Size([2048, 1024]) from checkpoint, the shape in current model is torch.Size([1536, 768]). +[default5]: size mismatch for self_attention.dense.weight: copying a param with shape torch.Size([2048, 1024]) from checkpoint, the shape in current model is torch.Size([1536, 768]). +[default5]: size mismatch for self_attention.dense.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default5]: size mismatch for post_attention_layernorm.weight: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default7]: size mismatch for self_attention.dense.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default7]: size mismatch for post_attention_layernorm.weight: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default7]: size mismatch for post_attention_layernorm.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default5]: size mismatch for post_attention_layernorm.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default5]: size mismatch for mlp.dense_h_to_4h.weight: copying a param with shape torch.Size([4096, 2048]) from checkpoint, the shape in current model is torch.Size([3072, 1536]). +[default5]: size mismatch for mlp.dense_h_to_4h.bias: copying a param with shape torch.Size([4096]) from checkpoint, the shape in current model is torch.Size([3072]). +[default5]: size mismatch for mlp.dense_4h_to_h.weight: copying a param with shape torch.Size([2048, 4096]) from checkpoint, the shape in current model is torch.Size([1536, 3072]). +[default7]: size mismatch for mlp.dense_h_to_4h.weight: copying a param with shape torch.Size([4096, 2048]) from checkpoint, the shape in current model is torch.Size([3072, 1536]). +[default7]: size mismatch for mlp.dense_h_to_4h.bias: copying a param with shape torch.Size([4096]) from checkpoint, the shape in current model is torch.Size([3072]). +[default7]: size mismatch for mlp.dense_4h_to_h.weight: copying a param with shape torch.Size([2048, 4096]) from checkpoint, the shape in current model is torch.Size([1536, 3072]). +[default7]: size mismatch for mlp.dense_4h_to_h.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default5]: size mismatch for mlp.dense_4h_to_h.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default0]:[2022-10-06 13:47:54,777] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default0]:[2022-10-06 13:47:54,778] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default5]:Traceback (most recent call last): +[default5]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default5]: main() +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default5]: return f(*args, **kwargs) +[default5]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default5]: pretrain( +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default5]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer +[default5]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default5]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default5]: load_path, client_states = self._load_checkpoint(load_dir, +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2640, in _load_checkpoint +[default5]: self.load_module_state_dict(state_dict=checkpoint['module'], +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1340, in load_module_state_dict +[default5]: self.module.load_state_dir(load_dir=self._curr_ckpt_path, +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/module.py", line 604, in load_state_dir +[default5]: layer.load_state_dict(checkpoint) +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/model/language_model.py", line 235, in load_state_dict +[default5]: self.word_embeddings.load_state_dict(state_dict_, strict=strict) +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1604, in load_state_dict +[default5]: raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format( +[default5]:RuntimeError: Error(s) in loading state_dict for VocabParallelEmbedding: +[default5]: size mismatch for weight: copying a param with shape torch.Size([125440, 2048]) from checkpoint, the shape in current model is torch.Size([125440, 1536]). +[default5]: size mismatch for norm.weight: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default5]: size mismatch for norm.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default7]:Traceback (most recent call last): +[default7]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default7]: main() +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default7]: return f(*args, **kwargs) +[default7]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default7]: pretrain( +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default7]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer +[default7]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default7]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default7]: load_path, client_states = self._load_checkpoint(load_dir, +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2640, in _load_checkpoint +[default7]: self.load_module_state_dict(state_dict=checkpoint['module'], +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1340, in load_module_state_dict +[default7]: self.module.load_state_dir(load_dir=self._curr_ckpt_path, +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/module.py", line 604, in load_state_dir +[default7]: layer.load_state_dict(checkpoint) +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/model/language_model.py", line 235, in load_state_dict +[default7]: self.word_embeddings.load_state_dict(state_dict_, strict=strict) +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1604, in load_state_dict +[default7]: raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format( +[default7]:RuntimeError: Error(s) in loading state_dict for VocabParallelEmbedding: +[default7]: size mismatch for weight: copying a param with shape torch.Size([125440, 2048]) from checkpoint, the shape in current model is torch.Size([125440, 1536]). +[default7]: size mismatch for norm.weight: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default7]: size mismatch for norm.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default7]:[2022-10-06 13:47:54,849] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_01-model_states.pt. +[default5]:[2022-10-06 13:47:54,849] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_01-model_states.pt. +[default0]:[2022-10-06 13:47:54,921] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default3]:[2022-10-06 13:47:54,934] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_01-model_states.pt. +[default1]:[2022-10-06 13:47:54,903] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default1]:[2022-10-06 13:47:54,904] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_01-model_states.pt... +[default2]:Traceback (most recent call last): +[default2]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default2]: main() +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default2]: return f(*args, **kwargs) +[default2]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default2]: pretrain( +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default2]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer +[default2]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default2]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default2]: load_path, client_states = self._load_checkpoint(load_dir, +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2640, in _load_checkpoint +[default2]: self.load_module_state_dict(state_dict=checkpoint['module'], +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1340, in load_module_state_dict +[default2]: self.module.load_state_dir(load_dir=self._curr_ckpt_path, +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/module.py", line 604, in load_state_dir +[default2]: layer.load_state_dict(checkpoint) +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/model/language_model.py", line 235, in load_state_dict +[default2]: self.word_embeddings.load_state_dict(state_dict_, strict=strict) +[default4]:[2022-10-06 13:47:54,910] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1604, in load_state_dict +[default2]: raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format( +[default2]:RuntimeError: Error(s) in loading state_dict for VocabParallelEmbedding: +[default2]: size mismatch for weight: copying a param with shape torch.Size([125440, 2048]) from checkpoint, the shape in current model is torch.Size([125440, 1536]). +[default2]: size mismatch for norm.weight: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default2]: size mismatch for norm.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default3]:Traceback (most recent call last): +[default3]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default3]: main() +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default3]: return f(*args, **kwargs) +[default3]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default3]: pretrain( +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default3]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer +[default3]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default3]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default3]: load_path, client_states = self._load_checkpoint(load_dir, +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2640, in _load_checkpoint +[default3]: self.load_module_state_dict(state_dict=checkpoint['module'], +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1340, in load_module_state_dict +[default3]: self.module.load_state_dir(load_dir=self._curr_ckpt_path, +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/module.py", line 604, in load_state_dir +[default3]: layer.load_state_dict(checkpoint) +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/model/language_model.py", line 235, in load_state_dict +[default3]: self.word_embeddings.load_state_dict(state_dict_, strict=strict) +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1604, in load_state_dict +[default3]: raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format( +[default3]:RuntimeError: Error(s) in loading state_dict for VocabParallelEmbedding: +[default3]: size mismatch for weight: copying a param with shape torch.Size([125440, 2048]) from checkpoint, the shape in current model is torch.Size([125440, 1536]). +[default3]: size mismatch for norm.weight: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default3]: size mismatch for norm.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default2]:[2022-10-06 13:47:54,913] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default4]:Traceback (most recent call last): +[default4]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default4]: main() +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default4]: return f(*args, **kwargs) +[default4]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default4]: pretrain( +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default4]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer +[default4]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default4]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default4]: load_path, client_states = self._load_checkpoint(load_dir, +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2640, in _load_checkpoint +[default4]: self.load_module_state_dict(state_dict=checkpoint['module'], +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1340, in load_module_state_dict +[default4]: self.module.load_state_dir(load_dir=self._curr_ckpt_path, +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/module.py", line 604, in load_state_dir +[default4]: layer.load_state_dict(checkpoint) +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/model/language_model.py", line 235, in load_state_dict +[default4]: self.word_embeddings.load_state_dict(state_dict_, strict=strict) +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1604, in load_state_dict +[default4]: raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format( +[default4]:RuntimeError: Error(s) in loading state_dict for VocabParallelEmbedding: +[default4]: size mismatch for weight: copying a param with shape torch.Size([125440, 2048]) from checkpoint, the shape in current model is torch.Size([125440, 1536]). +[default4]: size mismatch for norm.weight: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default4]: size mismatch for norm.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default6]:Traceback (most recent call last): +[default6]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default6]: main() +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default6]: return f(*args, **kwargs) +[default6]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default6]: pretrain( +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default6]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer +[default6]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default6]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default6]: load_path, client_states = self._load_checkpoint(load_dir, +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2640, in _load_checkpoint +[default6]: self.load_module_state_dict(state_dict=checkpoint['module'], +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1340, in load_module_state_dict +[default6]: self.module.load_state_dir(load_dir=self._curr_ckpt_path, +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/module.py", line 604, in load_state_dir +[default6]: layer.load_state_dict(checkpoint) +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/model/language_model.py", line 235, in load_state_dict +[default6]: self.word_embeddings.load_state_dict(state_dict_, strict=strict) +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1604, in load_state_dict +[default6]: raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format( +[default6]:RuntimeError: Error(s) in loading state_dict for VocabParallelEmbedding: +[default6]: size mismatch for weight: copying a param with shape torch.Size([125440, 2048]) from checkpoint, the shape in current model is torch.Size([125440, 1536]). +[default6]: size mismatch for norm.weight: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default6]: size mismatch for norm.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default6]:[2022-10-06 13:47:54,917] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default0]:Traceback (most recent call last): +[default0]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default0]: main() +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default0]: return f(*args, **kwargs) +[default0]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default0]: pretrain( +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default0]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer +[default0]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default0]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default0]: load_path, client_states = self._load_checkpoint(load_dir, +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2640, in _load_checkpoint +[default0]: self.load_module_state_dict(state_dict=checkpoint['module'], +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1340, in load_module_state_dict +[default0]: self.module.load_state_dir(load_dir=self._curr_ckpt_path, +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/module.py", line 604, in load_state_dir +[default0]: layer.load_state_dict(checkpoint) +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/model/language_model.py", line 235, in load_state_dict +[default0]: self.word_embeddings.load_state_dict(state_dict_, strict=strict) +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1604, in load_state_dict +[default0]: raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format( +[default0]:RuntimeError: Error(s) in loading state_dict for VocabParallelEmbedding: +[default0]: size mismatch for weight: copying a param with shape torch.Size([125440, 2048]) from checkpoint, the shape in current model is torch.Size([125440, 1536]). +[default0]: size mismatch for norm.weight: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default0]: size mismatch for norm.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default1]:[2022-10-06 13:47:54,996] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_01-model_states.pt. +[default1]:Traceback (most recent call last): +[default1]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default1]: main() +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default1]: return f(*args, **kwargs) +[default1]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default1]: pretrain( +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default1]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer +[default1]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default1]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default1]: load_path, client_states = self._load_checkpoint(load_dir, +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2640, in _load_checkpoint +[default1]: self.load_module_state_dict(state_dict=checkpoint['module'], +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1340, in load_module_state_dict +[default1]: self.module.load_state_dir(load_dir=self._curr_ckpt_path, +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/module.py", line 604, in load_state_dir +[default1]: layer.load_state_dict(checkpoint) +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/model/language_model.py", line 235, in load_state_dict +[default1]: self.word_embeddings.load_state_dict(state_dict_, strict=strict) +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1604, in load_state_dict +[default1]: raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format( +[default1]:RuntimeError: Error(s) in loading state_dict for VocabParallelEmbedding: +[default1]: size mismatch for weight: copying a param with shape torch.Size([125440, 2048]) from checkpoint, the shape in current model is torch.Size([125440, 1536]). +[default1]: size mismatch for norm.weight: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +[default1]: size mismatch for norm.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 3441133 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 2800004 closing signal SIGTERM +ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 4 (pid: 893481) of binary: /gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/bin/python +ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 0 (pid: 3738864) of binary: /gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/bin/python +ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 0 (pid: 3528994) of binary: /gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/bin/python +ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 0 (pid: 1096788) of binary: /gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/bin/python +ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 2 (pid: 2874075) of binary: /gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/bin/python +ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 0 (pid: 3917402) of binary: /gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/bin/python +ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 1 (pid: 2800005) of binary: /gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/bin/python +ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 1 (pid: 3441134) of binary: /gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/bin/python +Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 194, in _run_module_as_main + return _run_code(code, main_globals, None, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 87, in _run_code +Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 194, in _run_module_as_main + exec(code, run_globals) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 765, in +Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 194, in _run_module_as_main + return _run_code(code, main_globals, None, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 87, in _run_code + return _run_code(code, main_globals, None, + exec(code, run_globals) +Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 194, in _run_module_as_main + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 87, in _run_code + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 765, in + exec(code, run_globals) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 765, in + main() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 761, in main + run(args) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 752, in run +Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 194, in _run_module_as_main + elastic_launch( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 131, in __call__ + main() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 761, in main + main() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 194, in _run_module_as_main + run(args) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 752, in run + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 761, in main + return _run_code(code, main_globals, None, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 87, in _run_code + exec(code, run_globals) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 765, in + elastic_launch( + run(args) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 752, in run + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 131, in __call__ +Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 194, in _run_module_as_main + return launch_agent(self._config, self._entrypoint, list(args)) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 245, in launch_agent +Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 194, in _run_module_as_main + raise ChildFailedError( + return _run_code(code, main_globals, None, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 87, in _run_code + return _run_code(code, main_globals, None, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 87, in _run_code + exec(code, run_globals) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 765, in + exec(code, run_globals) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 765, in + elastic_launch( +torch.distributed.elastic.multiprocessing.errors.ChildFailedError: +============================================================ +/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py FAILED +------------------------------------------------------------ +Failures: +[1]: + time : 2022-10-06_13:47:54 + host : jean-zay-iam10-ib0 + rank : 2 (local_rank: 2) + exitcode : 1 (pid: 2800006) + error_file: /tmp/torchelastic_gcbdfh6l/none_n8mtfq_g/attempt_0/2/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 131, in __call__ + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + return launch_agent(self._config, self._entrypoint, list(args)) + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2640, in _load_checkpoint + self.load_module_state_dict(state_dict=checkpoint['module'], + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1340, in load_module_state_dict + self.module.load_state_dir(load_dir=self._curr_ckpt_path, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/module.py", line 604, in load_state_dir + layer.load_state_dict(checkpoint) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/model/language_model.py", line 235, in load_state_dict + self.word_embeddings.load_state_dict(state_dict_, strict=strict) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 245, in launch_agent + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1604, in load_state_dict + raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format( + RuntimeError: Error(s) in loading state_dict for VocabParallelEmbedding: + size mismatch for weight: copying a param with shape torch.Size([125440, 2048]) from checkpoint, the shape in current model is torch.Size([125440, 1536]). + size mismatch for norm.weight: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + size mismatch for norm.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + +[2]: + time : 2022-10-06_13:47:54 + host : jean-zay-iam10-ib0 + rank : 3 (local_rank: 3) + exitcode : 1 (pid: 2800007) + error_file: /tmp/torchelastic_gcbdfh6l/none_n8mtfq_g/attempt_0/3/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + raise ChildFailedError( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2640, in _load_checkpoint + self.load_module_state_dict(state_dict=checkpoint['module'], + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1340, in load_module_state_dict + self.module.load_state_dir(load_dir=self._curr_ckpt_path, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/module.py", line 604, in load_state_dir + layer.load_state_dict(checkpoint) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/model/language_model.py", line 235, in load_state_dict + self.word_embeddings.load_state_dict(state_dict_, strict=strict) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1604, in load_state_dict + raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format( + RuntimeError: Error(s) in loading state_dict for VocabParallelEmbedding: + size mismatch for weight: copying a param with shape torch.Size([125440, 2048]) from checkpoint, the shape in current model is torch.Size([125440, 1536]). + size mismatch for norm.weight: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + size mismatch for norm.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + +[3]: + time : 2022-10-06_13:47:54 + host : jean-zay-iam10-ib0 + rank : 4 (local_rank: 4) + exitcode : 1 (pid: 2800008) + error_file: /tmp/torchelastic_gcbdfh6l/none_n8mtfq_g/attempt_0/4/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + return launch_agent(self._config, self._entrypoint, list(args)) + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2640, in _load_checkpoint + self.load_module_state_dict(state_dict=checkpoint['module'], + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1340, in load_module_state_dict + self.module.load_state_dir(load_dir=self._curr_ckpt_path, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/module.py", line 604, in load_state_dir + layer.load_state_dict(checkpoint) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/model/language_model.py", line 235, in load_state_dict + self.word_embeddings.load_state_dict(state_dict_, strict=strict) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1604, in load_state_dict + raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format( + RuntimeError: Error(s) in loading state_dict for VocabParallelEmbedding: + size mismatch for weight: copying a param with shape torch.Size([125440, 2048]) from checkpoint, the shape in current model is torch.Size([125440, 1536]). + size mismatch for norm.weight: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + size mismatch for norm.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + +[4]: + time : 2022-10-06_13:47:54 + host : jean-zay-iam10-ib0 + rank : 5 (local_rank: 5) + exitcode : 1 (pid: 2800009) + error_file: /tmp/torchelastic_gcbdfh6l/none_n8mtfq_g/attempt_0/5/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 245, in launch_agent + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2640, in _load_checkpoint + self.load_module_state_dict(state_dict=checkpoint['module'], + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1340, in load_module_state_dict + self.module.load_state_dir(load_dir=self._curr_ckpt_path, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/module.py", line 604, in load_state_dir + layer.load_state_dict(checkpoint) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/model/language_model.py", line 235, in load_state_dict + self.word_embeddings.load_state_dict(state_dict_, strict=strict) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1604, in load_state_dict + raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format( + RuntimeError: Error(s) in loading state_dict for VocabParallelEmbedding: + size mismatch for weight: copying a param with shape torch.Size([125440, 2048]) from checkpoint, the shape in current model is torch.Size([125440, 1536]). + size mismatch for norm.weight: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + size mismatch for norm.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + +[5]: + time : 2022-10-06_13:47:54 + host : jean-zay-iam10-ib0 + rank : 6 (local_rank: 6) + exitcode : 1 (pid: 2800010) + error_file: /tmp/torchelastic_gcbdfh6l/none_n8mtfq_g/attempt_0/6/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + main() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + main() + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2640, in _load_checkpoint + self.load_module_state_dict(state_dict=checkpoint['module'], + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1340, in load_module_state_dict + self.module.load_state_dir(load_dir=self._curr_ckpt_path, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/module.py", line 604, in load_state_dir + layer.load_state_dict(checkpoint) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/model/language_model.py", line 235, in load_state_dict + self.word_embeddings.load_state_dict(state_dict_, strict=strict) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1604, in load_state_dict + raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format( + RuntimeError: Error(s) in loading state_dict for VocabParallelEmbedding: +torch.distributed.elastic.multiprocessing.errors.ChildFailedError: +============================================================ +/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py FAILED +------------------------------------------------------------ +Failures: +[1]: + time : 2022-10-06_13:47:54 + host : jean-zay-iam27-ib0 + rank : 50 (local_rank: 2) + exitcode : 1 (pid: 3441135) + error_file: /tmp/torchelastic_4yqinula/none_f5pnx2f_/attempt_0/2/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + size mismatch for weight: copying a param with shape torch.Size([125440, 2048]) from checkpoint, the shape in current model is torch.Size([125440, 1536]). + size mismatch for norm.weight: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + size mismatch for norm.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + +[6]: + time : 2022-10-06_13:47:54 + host : jean-zay-iam10-ib0 + rank : 7 (local_rank: 7) + exitcode : 1 (pid: 2800013) + error_file: /tmp/torchelastic_gcbdfh6l/none_n8mtfq_g/attempt_0/7/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2640, in _load_checkpoint + self.load_module_state_dict(state_dict=checkpoint['module'], + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1340, in load_module_state_dict + self.module.load_state_dir(load_dir=self._curr_ckpt_path, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/module.py", line 604, in load_state_dir + layer.load_state_dict(checkpoint) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/model/language_model.py", line 235, in load_state_dict + raise ChildFailedError( + self.word_embeddings.load_state_dict(state_dict_, strict=strict) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1604, in load_state_dict + raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format( + RuntimeError: Error(s) in loading state_dict for VocabParallelEmbedding: + size mismatch for weight: copying a param with shape torch.Size([125440, 2048]) from checkpoint, the shape in current model is torch.Size([125440, 1536]). + size mismatch for norm.weight: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + size mismatch for norm.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + +------------------------------------------------------------ +Root Cause (first observed failure): +[0]: + time : 2022-10-06_13:47:54 + host : jean-zay-iam10-ib0 + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + rank : 1 (local_rank: 1) + exitcode : 1 (pid: 2800005) + error_file: /tmp/torchelastic_gcbdfh6l/none_n8mtfq_g/attempt_0/1/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2640, in _load_checkpoint + self.load_module_state_dict(state_dict=checkpoint['module'], + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1340, in load_module_state_dict + self.module.load_state_dir(load_dir=self._curr_ckpt_path, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/module.py", line 604, in load_state_dir + layer.load_state_dict(checkpoint) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/model/language_model.py", line 235, in load_state_dict + self.word_embeddings.load_state_dict(state_dict_, strict=strict) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1604, in load_state_dict + raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format( + RuntimeError: Error(s) in loading state_dict for VocabParallelEmbedding: + size mismatch for weight: copying a param with shape torch.Size([125440, 2048]) from checkpoint, the shape in current model is torch.Size([125440, 1536]). + size mismatch for norm.weight: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + size mismatch for norm.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + +============================================================ + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2640, in _load_checkpoint + self.load_module_state_dict(state_dict=checkpoint['module'], + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1340, in load_module_state_dict + self.module.load_state_dir(load_dir=self._curr_ckpt_path, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/module.py", line 604, in load_state_dir + layer.load_state_dict(checkpoint) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1604, in load_state_dict + raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format( + RuntimeError: Error(s) in loading state_dict for ParallelTransformerLayerPipe: + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 761, in main + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 761, in main +torch.distributed.elastic.multiprocessing.errors.ChildFailedError: +============================================================ +/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py FAILED +------------------------------------------------------------ +Failures: +[1]: + time : 2022-10-06_13:47:53 + host : jean-zay-iam22-ib0 + rank : 25 (local_rank: 1) + exitcode : 1 (pid: 3738865) + error_file: /tmp/torchelastic_1b1s0q2k/none_ptdin831/attempt_0/1/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + size mismatch for input_layernorm.weight: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + size mismatch for input_layernorm.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + size mismatch for self_attention.query_key_value.weight: copying a param with shape torch.Size([3072, 2048]) from checkpoint, the shape in current model is torch.Size([2304, 1536]). + size mismatch for self_attention.query_key_value.bias: copying a param with shape torch.Size([3072]) from checkpoint, the shape in current model is torch.Size([2304]). + size mismatch for self_attention.dense.weight: copying a param with shape torch.Size([2048, 1024]) from checkpoint, the shape in current model is torch.Size([1536, 768]). + size mismatch for self_attention.dense.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2640, in _load_checkpoint + self.load_module_state_dict(state_dict=checkpoint['module'], + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1340, in load_module_state_dict + self.module.load_state_dir(load_dir=self._curr_ckpt_path, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/module.py", line 604, in load_state_dir + layer.load_state_dict(checkpoint) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/model/language_model.py", line 235, in load_state_dict + self.word_embeddings.load_state_dict(state_dict_, strict=strict) + size mismatch for post_attention_layernorm.weight: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1604, in load_state_dict + raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format( + RuntimeError: Error(s) in loading state_dict for VocabParallelEmbedding: + size mismatch for weight: copying a param with shape torch.Size([125440, 2048]) from checkpoint, the shape in current model is torch.Size([125440, 1536]). + size mismatch for norm.weight: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + size mismatch for norm.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + +[2]: + time : 2022-10-06_13:47:53 + host : jean-zay-iam22-ib0 + rank : 26 (local_rank: 2) + exitcode : 1 (pid: 3738866) + error_file: /tmp/torchelastic_1b1s0q2k/none_ptdin831/attempt_0/2/error.json + traceback : Traceback (most recent call last): + size mismatch for post_attention_layernorm.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + size mismatch for mlp.dense_h_to_4h.weight: copying a param with shape torch.Size([4096, 2048]) from checkpoint, the shape in current model is torch.Size([3072, 1536]). + size mismatch for mlp.dense_h_to_4h.bias: copying a param with shape torch.Size([4096]) from checkpoint, the shape in current model is torch.Size([3072]). + size mismatch for mlp.dense_4h_to_h.weight: copying a param with shape torch.Size([2048, 4096]) from checkpoint, the shape in current model is torch.Size([1536, 3072]). + size mismatch for mlp.dense_4h_to_h.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + +[2]: + time : 2022-10-06_13:47:54 + host : jean-zay-iam27-ib0 + rank : 51 (local_rank: 3) + exitcode : 1 (pid: 3441136) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + error_file: /tmp/torchelastic_4yqinula/none_f5pnx2f_/attempt_0/3/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2640, in _load_checkpoint + self.load_module_state_dict(state_dict=checkpoint['module'], + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1340, in load_module_state_dict + self.module.load_state_dir(load_dir=self._curr_ckpt_path, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/module.py", line 604, in load_state_dir + layer.load_state_dict(checkpoint) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/model/language_model.py", line 235, in load_state_dict + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2640, in _load_checkpoint + self.load_module_state_dict(state_dict=checkpoint['module'], + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1340, in load_module_state_dict + self.module.load_state_dir(load_dir=self._curr_ckpt_path, + self.word_embeddings.load_state_dict(state_dict_, strict=strict) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1604, in load_state_dict + raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format( + RuntimeError: Error(s) in loading state_dict for VocabParallelEmbedding: + size mismatch for weight: copying a param with shape torch.Size([125440, 2048]) from checkpoint, the shape in current model is torch.Size([125440, 1536]). + size mismatch for norm.weight: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + size mismatch for norm.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + +[3]: + time : 2022-10-06_13:47:53 + host : jean-zay-iam22-ib0 + rank : 27 (local_rank: 3) + exitcode : 1 (pid: 3738867) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/module.py", line 604, in load_state_dir + layer.load_state_dict(checkpoint) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1604, in load_state_dict + raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format( + RuntimeError: Error(s) in loading state_dict for ParallelTransformerLayerPipe: + size mismatch for input_layernorm.weight: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + size mismatch for input_layernorm.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + size mismatch for self_attention.query_key_value.weight: copying a param with shape torch.Size([3072, 2048]) from checkpoint, the shape in current model is torch.Size([2304, 1536]). + error_file: /tmp/torchelastic_1b1s0q2k/none_ptdin831/attempt_0/3/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + size mismatch for self_attention.query_key_value.bias: copying a param with shape torch.Size([3072]) from checkpoint, the shape in current model is torch.Size([2304]). + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + size mismatch for self_attention.dense.weight: copying a param with shape torch.Size([2048, 1024]) from checkpoint, the shape in current model is torch.Size([1536, 768]). + size mismatch for self_attention.dense.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + size mismatch for post_attention_layernorm.weight: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + size mismatch for post_attention_layernorm.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + size mismatch for mlp.dense_h_to_4h.weight: copying a param with shape torch.Size([4096, 2048]) from checkpoint, the shape in current model is torch.Size([3072, 1536]). + size mismatch for mlp.dense_h_to_4h.bias: copying a param with shape torch.Size([4096]) from checkpoint, the shape in current model is torch.Size([3072]). + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2640, in _load_checkpoint + self.load_module_state_dict(state_dict=checkpoint['module'], + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1340, in load_module_state_dict + self.module.load_state_dir(load_dir=self._curr_ckpt_path, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/module.py", line 604, in load_state_dir + size mismatch for mlp.dense_4h_to_h.weight: copying a param with shape torch.Size([2048, 4096]) from checkpoint, the shape in current model is torch.Size([1536, 3072]). + size mismatch for mlp.dense_4h_to_h.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + +[3]: + time : 2022-10-06_13:47:54 + host : jean-zay-iam27-ib0 + rank : 52 (local_rank: 4) + exitcode : 1 (pid: 3441137) + error_file: /tmp/torchelastic_4yqinula/none_f5pnx2f_/attempt_0/4/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + layer.load_state_dict(checkpoint) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/model/language_model.py", line 235, in load_state_dict + self.word_embeddings.load_state_dict(state_dict_, strict=strict) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1604, in load_state_dict + raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format( + RuntimeError: Error(s) in loading state_dict for VocabParallelEmbedding: + size mismatch for weight: copying a param with shape torch.Size([125440, 2048]) from checkpoint, the shape in current model is torch.Size([125440, 1536]). + size mismatch for norm.weight: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + run(args) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 752, in run + size mismatch for norm.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + +[4]: + time : 2022-10-06_13:47:53 + host : jean-zay-iam22-ib0 + rank : 28 (local_rank: 4) + exitcode : 1 (pid: 3738868) + error_file: /tmp/torchelastic_1b1s0q2k/none_ptdin831/attempt_0/4/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2640, in _load_checkpoint + self.load_module_state_dict(state_dict=checkpoint['module'], + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1340, in load_module_state_dict + self.module.load_state_dir(load_dir=self._curr_ckpt_path, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/module.py", line 604, in load_state_dir + layer.load_state_dict(checkpoint) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1604, in load_state_dict + raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format( + RuntimeError: Error(s) in loading state_dict for ParallelTransformerLayerPipe: + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2640, in _load_checkpoint + size mismatch for input_layernorm.weight: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + self.load_module_state_dict(state_dict=checkpoint['module'], + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1340, in load_module_state_dict + run(args) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 752, in run + size mismatch for input_layernorm.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + size mismatch for self_attention.query_key_value.weight: copying a param with shape torch.Size([3072, 2048]) from checkpoint, the shape in current model is torch.Size([2304, 1536]). + size mismatch for self_attention.query_key_value.bias: copying a param with shape torch.Size([3072]) from checkpoint, the shape in current model is torch.Size([2304]). + size mismatch for self_attention.dense.weight: copying a param with shape torch.Size([2048, 1024]) from checkpoint, the shape in current model is torch.Size([1536, 768]). + size mismatch for self_attention.dense.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + size mismatch for post_attention_layernorm.weight: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + self.module.load_state_dir(load_dir=self._curr_ckpt_path, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/module.py", line 604, in load_state_dir + layer.load_state_dict(checkpoint) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/model/language_model.py", line 235, in load_state_dict + self.word_embeddings.load_state_dict(state_dict_, strict=strict) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1604, in load_state_dict + raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format( + RuntimeError: Error(s) in loading state_dict for VocabParallelEmbedding: + size mismatch for weight: copying a param with shape torch.Size([125440, 2048]) from checkpoint, the shape in current model is torch.Size([125440, 1536]). + size mismatch for post_attention_layernorm.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + size mismatch for mlp.dense_h_to_4h.weight: copying a param with shape torch.Size([4096, 2048]) from checkpoint, the shape in current model is torch.Size([3072, 1536]). + size mismatch for mlp.dense_h_to_4h.bias: copying a param with shape torch.Size([4096]) from checkpoint, the shape in current model is torch.Size([3072]). + size mismatch for mlp.dense_4h_to_h.weight: copying a param with shape torch.Size([2048, 4096]) from checkpoint, the shape in current model is torch.Size([1536, 3072]). + size mismatch for mlp.dense_4h_to_h.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + +[4]: + time : 2022-10-06_13:47:54 + host : jean-zay-iam27-ib0 + rank : 53 (local_rank: 5) + exitcode : 1 (pid: 3441138) + size mismatch for norm.weight: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + size mismatch for norm.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + +[5]: + time : 2022-10-06_13:47:54 + host : jean-zay-iam22-ib0 + rank : 29 (local_rank: 5) + exitcode : 1 (pid: 3738869) + error_file: /tmp/torchelastic_1b1s0q2k/none_ptdin831/attempt_0/5/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + error_file: /tmp/torchelastic_4yqinula/none_f5pnx2f_/attempt_0/5/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2640, in _load_checkpoint + self.load_module_state_dict(state_dict=checkpoint['module'], + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1340, in load_module_state_dict + self.module.load_state_dir(load_dir=self._curr_ckpt_path, + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2640, in _load_checkpoint + self.load_module_state_dict(state_dict=checkpoint['module'], + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1340, in load_module_state_dict + self.module.load_state_dir(load_dir=self._curr_ckpt_path, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/module.py", line 604, in load_state_dir + layer.load_state_dict(checkpoint) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/model/language_model.py", line 235, in load_state_dict + self.word_embeddings.load_state_dict(state_dict_, strict=strict) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/module.py", line 604, in load_state_dir + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1604, in load_state_dict + raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format( + RuntimeError: Error(s) in loading state_dict for VocabParallelEmbedding: + layer.load_state_dict(checkpoint) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1604, in load_state_dict + raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format( + RuntimeError: Error(s) in loading state_dict for ParallelTransformerLayerPipe: + size mismatch for input_layernorm.weight: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + size mismatch for input_layernorm.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + size mismatch for self_attention.query_key_value.weight: copying a param with shape torch.Size([3072, 2048]) from checkpoint, the shape in current model is torch.Size([2304, 1536]). + size mismatch for self_attention.query_key_value.bias: copying a param with shape torch.Size([3072]) from checkpoint, the shape in current model is torch.Size([2304]). + size mismatch for weight: copying a param with shape torch.Size([125440, 2048]) from checkpoint, the shape in current model is torch.Size([125440, 1536]). + size mismatch for norm.weight: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + size mismatch for norm.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + +[6]: + time : 2022-10-06_13:47:53 + host : jean-zay-iam22-ib0 + rank : 30 (local_rank: 6) + exitcode : 1 (pid: 3738870) + error_file: /tmp/torchelastic_1b1s0q2k/none_ptdin831/attempt_0/6/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + size mismatch for self_attention.dense.weight: copying a param with shape torch.Size([2048, 1024]) from checkpoint, the shape in current model is torch.Size([1536, 768]). + size mismatch for self_attention.dense.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + size mismatch for post_attention_layernorm.weight: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + size mismatch for post_attention_layernorm.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + size mismatch for mlp.dense_h_to_4h.weight: copying a param with shape torch.Size([4096, 2048]) from checkpoint, the shape in current model is torch.Size([3072, 1536]). + size mismatch for mlp.dense_h_to_4h.bias: copying a param with shape torch.Size([4096]) from checkpoint, the shape in current model is torch.Size([3072]). + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + size mismatch for mlp.dense_4h_to_h.weight: copying a param with shape torch.Size([2048, 4096]) from checkpoint, the shape in current model is torch.Size([1536, 3072]). + size mismatch for mlp.dense_4h_to_h.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + +[5]: + time : 2022-10-06_13:47:54 + host : jean-zay-iam27-ib0 + rank : 54 (local_rank: 6) + exitcode : 1 (pid: 3441139) + error_file: /tmp/torchelastic_4yqinula/none_f5pnx2f_/attempt_0/6/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2640, in _load_checkpoint + self.load_module_state_dict(state_dict=checkpoint['module'], + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1340, in load_module_state_dict + self.module.load_state_dir(load_dir=self._curr_ckpt_path, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/module.py", line 604, in load_state_dir + layer.load_state_dict(checkpoint) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/model/language_model.py", line 235, in load_state_dict + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + self.word_embeddings.load_state_dict(state_dict_, strict=strict) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1604, in load_state_dict + raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format( + RuntimeError: Error(s) in loading state_dict for VocabParallelEmbedding: + size mismatch for weight: copying a param with shape torch.Size([125440, 2048]) from checkpoint, the shape in current model is torch.Size([125440, 1536]). + size mismatch for norm.weight: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + size mismatch for norm.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + +[7]: + time : 2022-10-06_13:47:54 + host : jean-zay-iam22-ib0 + rank : 31 (local_rank: 7) + exitcode : 1 (pid: 3738871) + load_path, client_states = self._load_checkpoint(load_dir, + error_file: /tmp/torchelastic_1b1s0q2k/none_ptdin831/attempt_0/7/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2640, in _load_checkpoint + self.load_module_state_dict(state_dict=checkpoint['module'], + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1340, in load_module_state_dict + self.module.load_state_dir(load_dir=self._curr_ckpt_path, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/module.py", line 604, in load_state_dir + layer.load_state_dict(checkpoint) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1604, in load_state_dict + raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format( + RuntimeError: Error(s) in loading state_dict for ParallelTransformerLayerPipe: + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + size mismatch for input_layernorm.weight: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + size mismatch for input_layernorm.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + size mismatch for self_attention.query_key_value.weight: copying a param with shape torch.Size([3072, 2048]) from checkpoint, the shape in current model is torch.Size([2304, 1536]). + size mismatch for self_attention.query_key_value.bias: copying a param with shape torch.Size([3072]) from checkpoint, the shape in current model is torch.Size([2304]). + size mismatch for self_attention.dense.weight: copying a param with shape torch.Size([2048, 1024]) from checkpoint, the shape in current model is torch.Size([1536, 768]). + size mismatch for self_attention.dense.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2640, in _load_checkpoint + self.load_module_state_dict(state_dict=checkpoint['module'], + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1340, in load_module_state_dict + self.module.load_state_dir(load_dir=self._curr_ckpt_path, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/module.py", line 604, in load_state_dir + size mismatch for post_attention_layernorm.weight: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + size mismatch for post_attention_layernorm.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + size mismatch for mlp.dense_h_to_4h.weight: copying a param with shape torch.Size([4096, 2048]) from checkpoint, the shape in current model is torch.Size([3072, 1536]). + size mismatch for mlp.dense_h_to_4h.bias: copying a param with shape torch.Size([4096]) from checkpoint, the shape in current model is torch.Size([3072]). + size mismatch for mlp.dense_4h_to_h.weight: copying a param with shape torch.Size([2048, 4096]) from checkpoint, the shape in current model is torch.Size([1536, 3072]). + size mismatch for mlp.dense_4h_to_h.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + +[6]: + layer.load_state_dict(checkpoint) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/model/language_model.py", line 235, in load_state_dict + self.word_embeddings.load_state_dict(state_dict_, strict=strict) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1604, in load_state_dict + raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format( + RuntimeError: Error(s) in loading state_dict for VocabParallelEmbedding: + size mismatch for weight: copying a param with shape torch.Size([125440, 2048]) from checkpoint, the shape in current model is torch.Size([125440, 1536]). + size mismatch for norm.weight: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + time : 2022-10-06_13:47:54 + host : jean-zay-iam27-ib0 + rank : 55 (local_rank: 7) + exitcode : 1 (pid: 3441140) + error_file: /tmp/torchelastic_4yqinula/none_f5pnx2f_/attempt_0/7/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer + size mismatch for norm.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + +------------------------------------------------------------ +Root Cause (first observed failure): +[0]: + time : 2022-10-06_13:47:53 + host : jean-zay-iam22-ib0 + rank : 24 (local_rank: 0) + exitcode : 1 (pid: 3738864) + error_file: /tmp/torchelastic_1b1s0q2k/none_ptdin831/attempt_0/0/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2640, in _load_checkpoint + self.load_module_state_dict(state_dict=checkpoint['module'], + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1340, in load_module_state_dict + self.module.load_state_dir(load_dir=self._curr_ckpt_path, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2640, in _load_checkpoint + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/module.py", line 604, in load_state_dir + layer.load_state_dict(checkpoint) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1604, in load_state_dict + raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format( + RuntimeError: Error(s) in loading state_dict for ParallelTransformerLayerPipe: + size mismatch for input_layernorm.weight: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + size mismatch for input_layernorm.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + size mismatch for self_attention.query_key_value.weight: copying a param with shape torch.Size([3072, 2048]) from checkpoint, the shape in current model is torch.Size([2304, 1536]). + self.load_module_state_dict(state_dict=checkpoint['module'], + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1340, in load_module_state_dict + self.module.load_state_dir(load_dir=self._curr_ckpt_path, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/module.py", line 604, in load_state_dir + layer.load_state_dict(checkpoint) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/model/language_model.py", line 235, in load_state_dict + self.word_embeddings.load_state_dict(state_dict_, strict=strict) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1604, in load_state_dict + raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format( + RuntimeError: Error(s) in loading state_dict for VocabParallelEmbedding: + size mismatch for self_attention.query_key_value.bias: copying a param with shape torch.Size([3072]) from checkpoint, the shape in current model is torch.Size([2304]). + size mismatch for self_attention.dense.weight: copying a param with shape torch.Size([2048, 1024]) from checkpoint, the shape in current model is torch.Size([1536, 768]). + size mismatch for self_attention.dense.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + size mismatch for post_attention_layernorm.weight: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + size mismatch for post_attention_layernorm.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + size mismatch for mlp.dense_h_to_4h.weight: copying a param with shape torch.Size([4096, 2048]) from checkpoint, the shape in current model is torch.Size([3072, 1536]). + size mismatch for weight: copying a param with shape torch.Size([125440, 2048]) from checkpoint, the shape in current model is torch.Size([125440, 1536]). + size mismatch for norm.weight: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + size mismatch for norm.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + +============================================================ + size mismatch for mlp.dense_h_to_4h.bias: copying a param with shape torch.Size([4096]) from checkpoint, the shape in current model is torch.Size([3072]). + size mismatch for mlp.dense_4h_to_h.weight: copying a param with shape torch.Size([2048, 4096]) from checkpoint, the shape in current model is torch.Size([1536, 3072]). + size mismatch for mlp.dense_4h_to_h.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + +------------------------------------------------------------ +Root Cause (first observed failure): +[0]: + time : 2022-10-06_13:47:54 + host : jean-zay-iam27-ib0 + rank : 49 (local_rank: 1) + exitcode : 1 (pid: 3441134) + error_file: /tmp/torchelastic_4yqinula/none_f5pnx2f_/attempt_0/1/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2640, in _load_checkpoint + self.load_module_state_dict(state_dict=checkpoint['module'], + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1340, in load_module_state_dict + self.module.load_state_dir(load_dir=self._curr_ckpt_path, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/module.py", line 604, in load_state_dir + layer.load_state_dict(checkpoint) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1604, in load_state_dict + raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format( + RuntimeError: Error(s) in loading state_dict for ParallelTransformerLayerPipe: + size mismatch for input_layernorm.weight: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + size mismatch for input_layernorm.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + size mismatch for self_attention.query_key_value.weight: copying a param with shape torch.Size([3072, 2048]) from checkpoint, the shape in current model is torch.Size([2304, 1536]). + size mismatch for self_attention.query_key_value.bias: copying a param with shape torch.Size([3072]) from checkpoint, the shape in current model is torch.Size([2304]). + size mismatch for self_attention.dense.weight: copying a param with shape torch.Size([2048, 1024]) from checkpoint, the shape in current model is torch.Size([1536, 768]). + size mismatch for self_attention.dense.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + size mismatch for post_attention_layernorm.weight: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + size mismatch for post_attention_layernorm.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + size mismatch for mlp.dense_h_to_4h.weight: copying a param with shape torch.Size([4096, 2048]) from checkpoint, the shape in current model is torch.Size([3072, 1536]). + size mismatch for mlp.dense_h_to_4h.bias: copying a param with shape torch.Size([4096]) from checkpoint, the shape in current model is torch.Size([3072]). + size mismatch for mlp.dense_4h_to_h.weight: copying a param with shape torch.Size([2048, 4096]) from checkpoint, the shape in current model is torch.Size([1536, 3072]). + size mismatch for mlp.dense_4h_to_h.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + +============================================================ + main() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + elastic_launch( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 131, in __call__ + elastic_launch( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 131, in __call__ + return launch_agent(self._config, self._entrypoint, list(args)) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 245, in launch_agent + return launch_agent(self._config, self._entrypoint, list(args)) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 245, in launch_agent + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 761, in main + raise ChildFailedError( + raise ChildFailedError( +torch.distributed.elastic.multiprocessing.errors.ChildFailedError: +============================================================ +/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py FAILED +------------------------------------------------------------ +Failures: +[1]: + time : 2022-10-06_13:47:53 + host : jean-zay-iam11-ib0 + rank : 9 (local_rank: 1) + exitcode : 1 (pid: 3528995) + error_file: /tmp/torchelastic_bcghnyuz/none_segr8087/attempt_0/1/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( +torch.distributed.elastic.multiprocessing.errors.ChildFailedError: +============================================================ +/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py FAILED +------------------------------------------------------------ +Failures: +[1]: + time : 2022-10-06_13:47:53 + host : jean-zay-iam12-ib0 + rank : 17 (local_rank: 1) + exitcode : 1 (pid: 1096789) + error_file: /tmp/torchelastic_toa6gwld/none_6zn_ykno/attempt_0/1/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2640, in _load_checkpoint + self.load_module_state_dict(state_dict=checkpoint['module'], + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1340, in load_module_state_dict + self.module.load_state_dir(load_dir=self._curr_ckpt_path, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/module.py", line 604, in load_state_dir + layer.load_state_dict(checkpoint) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/model/language_model.py", line 235, in load_state_dict + self.word_embeddings.load_state_dict(state_dict_, strict=strict) + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2640, in _load_checkpoint + self.load_module_state_dict(state_dict=checkpoint['module'], + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1340, in load_module_state_dict + self.module.load_state_dir(load_dir=self._curr_ckpt_path, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/module.py", line 604, in load_state_dir + layer.load_state_dict(checkpoint) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/model/language_model.py", line 235, in load_state_dict + self.word_embeddings.load_state_dict(state_dict_, strict=strict) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1604, in load_state_dict + raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format( + RuntimeError: Error(s) in loading state_dict for VocabParallelEmbedding: + size mismatch for weight: copying a param with shape torch.Size([125440, 2048]) from checkpoint, the shape in current model is torch.Size([125440, 1536]). + size mismatch for norm.weight: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + size mismatch for norm.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + +[2]: + time : 2022-10-06_13:47:53 + host : jean-zay-iam11-ib0 + rank : 10 (local_rank: 2) + exitcode : 1 (pid: 3528996) + error_file: /tmp/torchelastic_bcghnyuz/none_segr8087/attempt_0/2/error.json + traceback : Traceback (most recent call last): + run(args) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 752, in run + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1604, in load_state_dict + raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format( + RuntimeError: Error(s) in loading state_dict for VocabParallelEmbedding: + size mismatch for weight: copying a param with shape torch.Size([125440, 2048]) from checkpoint, the shape in current model is torch.Size([125440, 1536]). + size mismatch for norm.weight: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + size mismatch for norm.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + +[2]: + time : 2022-10-06_13:47:53 + host : jean-zay-iam12-ib0 + rank : 18 (local_rank: 2) + exitcode : 1 (pid: 1096790) + error_file: /tmp/torchelastic_toa6gwld/none_6zn_ykno/attempt_0/2/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2640, in _load_checkpoint + self.load_module_state_dict(state_dict=checkpoint['module'], + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1340, in load_module_state_dict + self.module.load_state_dir(load_dir=self._curr_ckpt_path, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/module.py", line 604, in load_state_dir + layer.load_state_dict(checkpoint) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/model/language_model.py", line 235, in load_state_dict + self.word_embeddings.load_state_dict(state_dict_, strict=strict) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1604, in load_state_dict + raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format( + RuntimeError: Error(s) in loading state_dict for VocabParallelEmbedding: + size mismatch for weight: copying a param with shape torch.Size([125440, 2048]) from checkpoint, the shape in current model is torch.Size([125440, 1536]). + size mismatch for norm.weight: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + size mismatch for norm.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + +[3]: + time : 2022-10-06_13:47:53 + host : jean-zay-iam11-ib0 + rank : 11 (local_rank: 3) + exitcode : 1 (pid: 3528997) + error_file: /tmp/torchelastic_bcghnyuz/none_segr8087/attempt_0/3/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2640, in _load_checkpoint + self.load_module_state_dict(state_dict=checkpoint['module'], + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1340, in load_module_state_dict + self.module.load_state_dir(load_dir=self._curr_ckpt_path, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/module.py", line 604, in load_state_dir + layer.load_state_dict(checkpoint) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/model/language_model.py", line 235, in load_state_dict + self.word_embeddings.load_state_dict(state_dict_, strict=strict) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1604, in load_state_dict + raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format( + RuntimeError: Error(s) in loading state_dict for VocabParallelEmbedding: + size mismatch for weight: copying a param with shape torch.Size([125440, 2048]) from checkpoint, the shape in current model is torch.Size([125440, 1536]). + size mismatch for norm.weight: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + size mismatch for norm.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + +[4]: + time : 2022-10-06_13:47:54 + host : jean-zay-iam11-ib0 + rank : 12 (local_rank: 4) + exitcode : 1 (pid: 3528998) + error_file: /tmp/torchelastic_bcghnyuz/none_segr8087/attempt_0/4/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2640, in _load_checkpoint + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2640, in _load_checkpoint + self.load_module_state_dict(state_dict=checkpoint['module'], + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1340, in load_module_state_dict + self.module.load_state_dir(load_dir=self._curr_ckpt_path, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/module.py", line 604, in load_state_dir + layer.load_state_dict(checkpoint) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/model/language_model.py", line 235, in load_state_dict + self.load_module_state_dict(state_dict=checkpoint['module'], + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1340, in load_module_state_dict + self.word_embeddings.load_state_dict(state_dict_, strict=strict) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1604, in load_state_dict + raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format( + RuntimeError: Error(s) in loading state_dict for VocabParallelEmbedding: + size mismatch for weight: copying a param with shape torch.Size([125440, 2048]) from checkpoint, the shape in current model is torch.Size([125440, 1536]). + size mismatch for norm.weight: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + size mismatch for norm.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + +[3]: + time : 2022-10-06_13:47:53 + host : jean-zay-iam12-ib0 + rank : 19 (local_rank: 3) + exitcode : 1 (pid: 1096791) + self.module.load_state_dir(load_dir=self._curr_ckpt_path, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/module.py", line 604, in load_state_dir + layer.load_state_dict(checkpoint) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/model/language_model.py", line 235, in load_state_dict + self.word_embeddings.load_state_dict(state_dict_, strict=strict) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1604, in load_state_dict + raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format( + RuntimeError: Error(s) in loading state_dict for VocabParallelEmbedding: + size mismatch for weight: copying a param with shape torch.Size([125440, 2048]) from checkpoint, the shape in current model is torch.Size([125440, 1536]). + error_file: /tmp/torchelastic_toa6gwld/none_6zn_ykno/attempt_0/3/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + size mismatch for norm.weight: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + size mismatch for norm.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + +[5]: + time : 2022-10-06_13:47:54 + host : jean-zay-iam11-ib0 + rank : 13 (local_rank: 5) + exitcode : 1 (pid: 3528999) + error_file: /tmp/torchelastic_bcghnyuz/none_segr8087/attempt_0/5/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2640, in _load_checkpoint + self.load_module_state_dict(state_dict=checkpoint['module'], + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1340, in load_module_state_dict + self.module.load_state_dir(load_dir=self._curr_ckpt_path, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/module.py", line 604, in load_state_dir + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2640, in _load_checkpoint + self.load_module_state_dict(state_dict=checkpoint['module'], + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1340, in load_module_state_dict + self.module.load_state_dir(load_dir=self._curr_ckpt_path, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/module.py", line 604, in load_state_dir + layer.load_state_dict(checkpoint) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/model/language_model.py", line 235, in load_state_dict + self.word_embeddings.load_state_dict(state_dict_, strict=strict) + layer.load_state_dict(checkpoint) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/model/language_model.py", line 235, in load_state_dict + self.word_embeddings.load_state_dict(state_dict_, strict=strict) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1604, in load_state_dict + raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format( + RuntimeError: Error(s) in loading state_dict for VocabParallelEmbedding: + size mismatch for weight: copying a param with shape torch.Size([125440, 2048]) from checkpoint, the shape in current model is torch.Size([125440, 1536]). + size mismatch for norm.weight: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1604, in load_state_dict + raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format( + RuntimeError: Error(s) in loading state_dict for VocabParallelEmbedding: + size mismatch for norm.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + +[4]: + time : 2022-10-06_13:47:54 + host : jean-zay-iam12-ib0 + rank : 20 (local_rank: 4) + exitcode : 1 (pid: 1096792) + error_file: /tmp/torchelastic_toa6gwld/none_6zn_ykno/attempt_0/4/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + size mismatch for weight: copying a param with shape torch.Size([125440, 2048]) from checkpoint, the shape in current model is torch.Size([125440, 1536]). + size mismatch for norm.weight: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + size mismatch for norm.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + +[6]: + time : 2022-10-06_13:47:53 + host : jean-zay-iam11-ib0 + rank : 14 (local_rank: 6) + exitcode : 1 (pid: 3529000) + error_file: /tmp/torchelastic_bcghnyuz/none_segr8087/attempt_0/6/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2640, in _load_checkpoint + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + self.load_module_state_dict(state_dict=checkpoint['module'], + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1340, in load_module_state_dict + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2640, in _load_checkpoint + self.load_module_state_dict(state_dict=checkpoint['module'], + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1340, in load_module_state_dict + self.module.load_state_dir(load_dir=self._curr_ckpt_path, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/module.py", line 604, in load_state_dir + layer.load_state_dict(checkpoint) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/model/language_model.py", line 235, in load_state_dict + self.module.load_state_dir(load_dir=self._curr_ckpt_path, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/module.py", line 604, in load_state_dir + layer.load_state_dict(checkpoint) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/model/language_model.py", line 235, in load_state_dict + self.word_embeddings.load_state_dict(state_dict_, strict=strict) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1604, in load_state_dict + raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format( + RuntimeError: Error(s) in loading state_dict for VocabParallelEmbedding: + size mismatch for weight: copying a param with shape torch.Size([125440, 2048]) from checkpoint, the shape in current model is torch.Size([125440, 1536]). + self.word_embeddings.load_state_dict(state_dict_, strict=strict) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1604, in load_state_dict + raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format( + RuntimeError: Error(s) in loading state_dict for VocabParallelEmbedding: + size mismatch for weight: copying a param with shape torch.Size([125440, 2048]) from checkpoint, the shape in current model is torch.Size([125440, 1536]). + size mismatch for norm.weight: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + size mismatch for norm.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + +[7]: + time : 2022-10-06_13:47:54 + host : jean-zay-iam11-ib0 + rank : 15 (local_rank: 7) + exitcode : 1 (pid: 3529001) + size mismatch for norm.weight: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + size mismatch for norm.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + +[5]: + time : 2022-10-06_13:47:54 + host : jean-zay-iam12-ib0 + rank : 21 (local_rank: 5) + exitcode : 1 (pid: 1096793) + error_file: /tmp/torchelastic_toa6gwld/none_6zn_ykno/attempt_0/5/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + error_file: /tmp/torchelastic_bcghnyuz/none_segr8087/attempt_0/7/error.json + traceback : Traceback (most recent call last): + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2640, in _load_checkpoint + self.load_module_state_dict(state_dict=checkpoint['module'], + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1340, in load_module_state_dict + self.module.load_state_dir(load_dir=self._curr_ckpt_path, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/module.py", line 604, in load_state_dir + layer.load_state_dict(checkpoint) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/model/language_model.py", line 235, in load_state_dict + self.word_embeddings.load_state_dict(state_dict_, strict=strict) + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2640, in _load_checkpoint + self.load_module_state_dict(state_dict=checkpoint['module'], + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1340, in load_module_state_dict + self.module.load_state_dir(load_dir=self._curr_ckpt_path, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/module.py", line 604, in load_state_dir + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1604, in load_state_dict + raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format( + RuntimeError: Error(s) in loading state_dict for VocabParallelEmbedding: + layer.load_state_dict(checkpoint) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/model/language_model.py", line 235, in load_state_dict + self.word_embeddings.load_state_dict(state_dict_, strict=strict) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1604, in load_state_dict + raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format( + RuntimeError: Error(s) in loading state_dict for VocabParallelEmbedding: + size mismatch for weight: copying a param with shape torch.Size([125440, 2048]) from checkpoint, the shape in current model is torch.Size([125440, 1536]). + size mismatch for norm.weight: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + size mismatch for weight: copying a param with shape torch.Size([125440, 2048]) from checkpoint, the shape in current model is torch.Size([125440, 1536]). + size mismatch for norm.weight: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + size mismatch for norm.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + +[6]: + time : 2022-10-06_13:47:54 + host : jean-zay-iam12-ib0 + rank : 22 (local_rank: 6) + exitcode : 1 (pid: 1096794) + error_file: /tmp/torchelastic_toa6gwld/none_6zn_ykno/attempt_0/6/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + size mismatch for norm.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + +------------------------------------------------------------ +Root Cause (first observed failure): +[0]: + time : 2022-10-06_13:47:53 + host : jean-zay-iam11-ib0 + rank : 8 (local_rank: 0) + exitcode : 1 (pid: 3528994) + error_file: /tmp/torchelastic_bcghnyuz/none_segr8087/attempt_0/0/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2640, in _load_checkpoint + self.load_module_state_dict(state_dict=checkpoint['module'], + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1340, in load_module_state_dict + self.module.load_state_dir(load_dir=self._curr_ckpt_path, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/module.py", line 604, in load_state_dir + layer.load_state_dict(checkpoint) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/model/language_model.py", line 235, in load_state_dict + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2640, in _load_checkpoint + self.word_embeddings.load_state_dict(state_dict_, strict=strict) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1604, in load_state_dict + raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format( + RuntimeError: Error(s) in loading state_dict for VocabParallelEmbedding: + size mismatch for weight: copying a param with shape torch.Size([125440, 2048]) from checkpoint, the shape in current model is torch.Size([125440, 1536]). + size mismatch for norm.weight: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + size mismatch for norm.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + +[7]: + time : 2022-10-06_13:47:54 + host : jean-zay-iam12-ib0 + rank : 23 (local_rank: 7) + exitcode : 1 (pid: 1096795) + self.load_module_state_dict(state_dict=checkpoint['module'], + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1340, in load_module_state_dict + self.module.load_state_dir(load_dir=self._curr_ckpt_path, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/module.py", line 604, in load_state_dir + layer.load_state_dict(checkpoint) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/model/language_model.py", line 235, in load_state_dict + self.word_embeddings.load_state_dict(state_dict_, strict=strict) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1604, in load_state_dict + raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format( + RuntimeError: Error(s) in loading state_dict for VocabParallelEmbedding: + error_file: /tmp/torchelastic_toa6gwld/none_6zn_ykno/attempt_0/7/error.json + traceback : Traceback (most recent call last): + size mismatch for weight: copying a param with shape torch.Size([125440, 2048]) from checkpoint, the shape in current model is torch.Size([125440, 1536]). + size mismatch for norm.weight: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + size mismatch for norm.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + +============================================================ + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2640, in _load_checkpoint + self.load_module_state_dict(state_dict=checkpoint['module'], + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1340, in load_module_state_dict + self.module.load_state_dir(load_dir=self._curr_ckpt_path, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/module.py", line 604, in load_state_dir + layer.load_state_dict(checkpoint) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/model/language_model.py", line 235, in load_state_dict + self.word_embeddings.load_state_dict(state_dict_, strict=strict) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1604, in load_state_dict + raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format( + RuntimeError: Error(s) in loading state_dict for VocabParallelEmbedding: + size mismatch for weight: copying a param with shape torch.Size([125440, 2048]) from checkpoint, the shape in current model is torch.Size([125440, 1536]). + size mismatch for norm.weight: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + size mismatch for norm.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + +------------------------------------------------------------ +Root Cause (first observed failure): +[0]: + time : 2022-10-06_13:47:53 + host : jean-zay-iam12-ib0 + rank : 16 (local_rank: 0) + exitcode : 1 (pid: 1096788) + error_file: /tmp/torchelastic_toa6gwld/none_6zn_ykno/attempt_0/0/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2640, in _load_checkpoint + self.load_module_state_dict(state_dict=checkpoint['module'], + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1340, in load_module_state_dict + self.module.load_state_dir(load_dir=self._curr_ckpt_path, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/module.py", line 604, in load_state_dir + layer.load_state_dict(checkpoint) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/model/language_model.py", line 235, in load_state_dict + self.word_embeddings.load_state_dict(state_dict_, strict=strict) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1604, in load_state_dict + raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format( + RuntimeError: Error(s) in loading state_dict for VocabParallelEmbedding: + size mismatch for weight: copying a param with shape torch.Size([125440, 2048]) from checkpoint, the shape in current model is torch.Size([125440, 1536]). + size mismatch for norm.weight: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + size mismatch for norm.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + +============================================================ + elastic_launch( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 131, in __call__ + return launch_agent(self._config, self._entrypoint, list(args)) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 245, in launch_agent + raise ChildFailedError( +torch.distributed.elastic.multiprocessing.errors.ChildFailedError: +============================================================ +/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py FAILED +------------------------------------------------------------ +Failures: +[0]: + time : 2022-10-06_13:47:53 + host : jean-zay-iam23-ib0 + rank : 32 (local_rank: 0) + exitcode : 1 (pid: 2874073) + error_file: /tmp/torchelastic_dw_znvh1/none_heu976xc/attempt_0/0/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2640, in _load_checkpoint + self.load_module_state_dict(state_dict=checkpoint['module'], + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1340, in load_module_state_dict + self.module.load_state_dir(load_dir=self._curr_ckpt_path, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/module.py", line 604, in load_state_dir + layer.load_state_dict(checkpoint) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1604, in load_state_dict + raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format( + RuntimeError: Error(s) in loading state_dict for ParallelTransformerLayerPipe: + size mismatch for input_layernorm.weight: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + size mismatch for input_layernorm.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + size mismatch for self_attention.query_key_value.weight: copying a param with shape torch.Size([3072, 2048]) from checkpoint, the shape in current model is torch.Size([2304, 1536]). + size mismatch for self_attention.query_key_value.bias: copying a param with shape torch.Size([3072]) from checkpoint, the shape in current model is torch.Size([2304]). + size mismatch for self_attention.dense.weight: copying a param with shape torch.Size([2048, 1024]) from checkpoint, the shape in current model is torch.Size([1536, 768]). + size mismatch for self_attention.dense.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + size mismatch for post_attention_layernorm.weight: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + size mismatch for post_attention_layernorm.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + size mismatch for mlp.dense_h_to_4h.weight: copying a param with shape torch.Size([4096, 2048]) from checkpoint, the shape in current model is torch.Size([3072, 1536]). + size mismatch for mlp.dense_h_to_4h.bias: copying a param with shape torch.Size([4096]) from checkpoint, the shape in current model is torch.Size([3072]). + size mismatch for mlp.dense_4h_to_h.weight: copying a param with shape torch.Size([2048, 4096]) from checkpoint, the shape in current model is torch.Size([1536, 3072]). + size mismatch for mlp.dense_4h_to_h.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + +[1]: + time : 2022-10-06_13:47:53 + host : jean-zay-iam23-ib0 + rank : 33 (local_rank: 1) + exitcode : 1 (pid: 2874074) + error_file: /tmp/torchelastic_dw_znvh1/none_heu976xc/attempt_0/1/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2640, in _load_checkpoint + self.load_module_state_dict(state_dict=checkpoint['module'], + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1340, in load_module_state_dict + self.module.load_state_dir(load_dir=self._curr_ckpt_path, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/module.py", line 604, in load_state_dir + layer.load_state_dict(checkpoint) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1604, in load_state_dict + raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format( + RuntimeError: Error(s) in loading state_dict for ParallelTransformerLayerPipe: + size mismatch for input_layernorm.weight: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + size mismatch for input_layernorm.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + size mismatch for self_attention.query_key_value.weight: copying a param with shape torch.Size([3072, 2048]) from checkpoint, the shape in current model is torch.Size([2304, 1536]). + size mismatch for self_attention.query_key_value.bias: copying a param with shape torch.Size([3072]) from checkpoint, the shape in current model is torch.Size([2304]). + size mismatch for self_attention.dense.weight: copying a param with shape torch.Size([2048, 1024]) from checkpoint, the shape in current model is torch.Size([1536, 768]). + size mismatch for self_attention.dense.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + size mismatch for post_attention_layernorm.weight: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + size mismatch for post_attention_layernorm.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + size mismatch for mlp.dense_h_to_4h.weight: copying a param with shape torch.Size([4096, 2048]) from checkpoint, the shape in current model is torch.Size([3072, 1536]). + size mismatch for mlp.dense_h_to_4h.bias: copying a param with shape torch.Size([4096]) from checkpoint, the shape in current model is torch.Size([3072]). + size mismatch for mlp.dense_4h_to_h.weight: copying a param with shape torch.Size([2048, 4096]) from checkpoint, the shape in current model is torch.Size([1536, 3072]). + size mismatch for mlp.dense_4h_to_h.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + +[3]: + time : 2022-10-06_13:47:53 + host : jean-zay-iam23-ib0 + rank : 35 (local_rank: 3) + exitcode : 1 (pid: 2874076) + error_file: /tmp/torchelastic_dw_znvh1/none_heu976xc/attempt_0/3/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2640, in _load_checkpoint + self.load_module_state_dict(state_dict=checkpoint['module'], + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1340, in load_module_state_dict + self.module.load_state_dir(load_dir=self._curr_ckpt_path, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/module.py", line 604, in load_state_dir + layer.load_state_dict(checkpoint) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1604, in load_state_dict + raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format( + RuntimeError: Error(s) in loading state_dict for ParallelTransformerLayerPipe: + size mismatch for input_layernorm.weight: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + size mismatch for input_layernorm.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + size mismatch for self_attention.query_key_value.weight: copying a param with shape torch.Size([3072, 2048]) from checkpoint, the shape in current model is torch.Size([2304, 1536]). + size mismatch for self_attention.query_key_value.bias: copying a param with shape torch.Size([3072]) from checkpoint, the shape in current model is torch.Size([2304]). + size mismatch for self_attention.dense.weight: copying a param with shape torch.Size([2048, 1024]) from checkpoint, the shape in current model is torch.Size([1536, 768]). + size mismatch for self_attention.dense.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + size mismatch for post_attention_layernorm.weight: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + size mismatch for post_attention_layernorm.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + size mismatch for mlp.dense_h_to_4h.weight: copying a param with shape torch.Size([4096, 2048]) from checkpoint, the shape in current model is torch.Size([3072, 1536]). + size mismatch for mlp.dense_h_to_4h.bias: copying a param with shape torch.Size([4096]) from checkpoint, the shape in current model is torch.Size([3072]). + size mismatch for mlp.dense_4h_to_h.weight: copying a param with shape torch.Size([2048, 4096]) from checkpoint, the shape in current model is torch.Size([1536, 3072]). + size mismatch for mlp.dense_4h_to_h.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + +[4]: + time : 2022-10-06_13:47:53 + host : jean-zay-iam23-ib0 + rank : 36 (local_rank: 4) + exitcode : 1 (pid: 2874077) + error_file: /tmp/torchelastic_dw_znvh1/none_heu976xc/attempt_0/4/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2640, in _load_checkpoint + self.load_module_state_dict(state_dict=checkpoint['module'], + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1340, in load_module_state_dict + self.module.load_state_dir(load_dir=self._curr_ckpt_path, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/module.py", line 604, in load_state_dir + layer.load_state_dict(checkpoint) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1604, in load_state_dict + raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format( + RuntimeError: Error(s) in loading state_dict for ParallelTransformerLayerPipe: + size mismatch for input_layernorm.weight: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + size mismatch for input_layernorm.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + size mismatch for self_attention.query_key_value.weight: copying a param with shape torch.Size([3072, 2048]) from checkpoint, the shape in current model is torch.Size([2304, 1536]). + size mismatch for self_attention.query_key_value.bias: copying a param with shape torch.Size([3072]) from checkpoint, the shape in current model is torch.Size([2304]). + size mismatch for self_attention.dense.weight: copying a param with shape torch.Size([2048, 1024]) from checkpoint, the shape in current model is torch.Size([1536, 768]). + size mismatch for self_attention.dense.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + size mismatch for post_attention_layernorm.weight: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + size mismatch for post_attention_layernorm.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + size mismatch for mlp.dense_h_to_4h.weight: copying a param with shape torch.Size([4096, 2048]) from checkpoint, the shape in current model is torch.Size([3072, 1536]). + size mismatch for mlp.dense_h_to_4h.bias: copying a param with shape torch.Size([4096]) from checkpoint, the shape in current model is torch.Size([3072]). + size mismatch for mlp.dense_4h_to_h.weight: copying a param with shape torch.Size([2048, 4096]) from checkpoint, the shape in current model is torch.Size([1536, 3072]). + size mismatch for mlp.dense_4h_to_h.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + +[5]: + time : 2022-10-06_13:47:52 + host : jean-zay-iam23-ib0 + rank : 37 (local_rank: 5) + exitcode : 1 (pid: 2874078) + error_file: /tmp/torchelastic_dw_znvh1/none_heu976xc/attempt_0/5/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2640, in _load_checkpoint + self.load_module_state_dict(state_dict=checkpoint['module'], + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1340, in load_module_state_dict + self.module.load_state_dir(load_dir=self._curr_ckpt_path, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/module.py", line 604, in load_state_dir + layer.load_state_dict(checkpoint) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1604, in load_state_dict + raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format( + RuntimeError: Error(s) in loading state_dict for ParallelTransformerLayerPipe: + size mismatch for input_layernorm.weight: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + size mismatch for input_layernorm.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + size mismatch for self_attention.query_key_value.weight: copying a param with shape torch.Size([3072, 2048]) from checkpoint, the shape in current model is torch.Size([2304, 1536]). + size mismatch for self_attention.query_key_value.bias: copying a param with shape torch.Size([3072]) from checkpoint, the shape in current model is torch.Size([2304]). + size mismatch for self_attention.dense.weight: copying a param with shape torch.Size([2048, 1024]) from checkpoint, the shape in current model is torch.Size([1536, 768]). + size mismatch for self_attention.dense.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + size mismatch for post_attention_layernorm.weight: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + size mismatch for post_attention_layernorm.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + size mismatch for mlp.dense_h_to_4h.weight: copying a param with shape torch.Size([4096, 2048]) from checkpoint, the shape in current model is torch.Size([3072, 1536]). + size mismatch for mlp.dense_h_to_4h.bias: copying a param with shape torch.Size([4096]) from checkpoint, the shape in current model is torch.Size([3072]). + size mismatch for mlp.dense_4h_to_h.weight: copying a param with shape torch.Size([2048, 4096]) from checkpoint, the shape in current model is torch.Size([1536, 3072]). + size mismatch for mlp.dense_4h_to_h.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + +[6]: + time : 2022-10-06_13:47:52 + host : jean-zay-iam23-ib0 + rank : 38 (local_rank: 6) + exitcode : 1 (pid: 2874079) + error_file: /tmp/torchelastic_dw_znvh1/none_heu976xc/attempt_0/6/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2640, in _load_checkpoint + self.load_module_state_dict(state_dict=checkpoint['module'], + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1340, in load_module_state_dict + self.module.load_state_dir(load_dir=self._curr_ckpt_path, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/module.py", line 604, in load_state_dir + layer.load_state_dict(checkpoint) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1604, in load_state_dict + raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format( + RuntimeError: Error(s) in loading state_dict for ParallelTransformerLayerPipe: + size mismatch for input_layernorm.weight: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + size mismatch for input_layernorm.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + size mismatch for self_attention.query_key_value.weight: copying a param with shape torch.Size([3072, 2048]) from checkpoint, the shape in current model is torch.Size([2304, 1536]). + size mismatch for self_attention.query_key_value.bias: copying a param with shape torch.Size([3072]) from checkpoint, the shape in current model is torch.Size([2304]). + size mismatch for self_attention.dense.weight: copying a param with shape torch.Size([2048, 1024]) from checkpoint, the shape in current model is torch.Size([1536, 768]). + size mismatch for self_attention.dense.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + size mismatch for post_attention_layernorm.weight: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + size mismatch for post_attention_layernorm.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + size mismatch for mlp.dense_h_to_4h.weight: copying a param with shape torch.Size([4096, 2048]) from checkpoint, the shape in current model is torch.Size([3072, 1536]). + size mismatch for mlp.dense_h_to_4h.bias: copying a param with shape torch.Size([4096]) from checkpoint, the shape in current model is torch.Size([3072]). + size mismatch for mlp.dense_4h_to_h.weight: copying a param with shape torch.Size([2048, 4096]) from checkpoint, the shape in current model is torch.Size([1536, 3072]). + size mismatch for mlp.dense_4h_to_h.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + +[7]: + time : 2022-10-06_13:47:52 + host : jean-zay-iam23-ib0 + rank : 39 (local_rank: 7) + exitcode : 1 (pid: 2874080) + error_file: /tmp/torchelastic_dw_znvh1/none_heu976xc/attempt_0/7/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2640, in _load_checkpoint + self.load_module_state_dict(state_dict=checkpoint['module'], + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1340, in load_module_state_dict + self.module.load_state_dir(load_dir=self._curr_ckpt_path, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/module.py", line 604, in load_state_dir + layer.load_state_dict(checkpoint) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1604, in load_state_dict + raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format( + RuntimeError: Error(s) in loading state_dict for ParallelTransformerLayerPipe: + size mismatch for input_layernorm.weight: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + size mismatch for input_layernorm.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + size mismatch for self_attention.query_key_value.weight: copying a param with shape torch.Size([3072, 2048]) from checkpoint, the shape in current model is torch.Size([2304, 1536]). + size mismatch for self_attention.query_key_value.bias: copying a param with shape torch.Size([3072]) from checkpoint, the shape in current model is torch.Size([2304]). + size mismatch for self_attention.dense.weight: copying a param with shape torch.Size([2048, 1024]) from checkpoint, the shape in current model is torch.Size([1536, 768]). + size mismatch for self_attention.dense.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + size mismatch for post_attention_layernorm.weight: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + size mismatch for post_attention_layernorm.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + size mismatch for mlp.dense_h_to_4h.weight: copying a param with shape torch.Size([4096, 2048]) from checkpoint, the shape in current model is torch.Size([3072, 1536]). + size mismatch for mlp.dense_h_to_4h.bias: copying a param with shape torch.Size([4096]) from checkpoint, the shape in current model is torch.Size([3072]). + size mismatch for mlp.dense_4h_to_h.weight: copying a param with shape torch.Size([2048, 4096]) from checkpoint, the shape in current model is torch.Size([1536, 3072]). + size mismatch for mlp.dense_4h_to_h.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + +------------------------------------------------------------ +Root Cause (first observed failure): +[2]: + time : 2022-10-06_13:47:52 + host : jean-zay-iam23-ib0 + rank : 34 (local_rank: 2) + exitcode : 1 (pid: 2874075) + error_file: /tmp/torchelastic_dw_znvh1/none_heu976xc/attempt_0/2/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2640, in _load_checkpoint + self.load_module_state_dict(state_dict=checkpoint['module'], + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1340, in load_module_state_dict + self.module.load_state_dir(load_dir=self._curr_ckpt_path, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/module.py", line 604, in load_state_dir + layer.load_state_dict(checkpoint) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1604, in load_state_dict + raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format( + RuntimeError: Error(s) in loading state_dict for ParallelTransformerLayerPipe: + size mismatch for input_layernorm.weight: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + size mismatch for input_layernorm.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + size mismatch for self_attention.query_key_value.weight: copying a param with shape torch.Size([3072, 2048]) from checkpoint, the shape in current model is torch.Size([2304, 1536]). + size mismatch for self_attention.query_key_value.bias: copying a param with shape torch.Size([3072]) from checkpoint, the shape in current model is torch.Size([2304]). + size mismatch for self_attention.dense.weight: copying a param with shape torch.Size([2048, 1024]) from checkpoint, the shape in current model is torch.Size([1536, 768]). + size mismatch for self_attention.dense.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + size mismatch for post_attention_layernorm.weight: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + size mismatch for post_attention_layernorm.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + size mismatch for mlp.dense_h_to_4h.weight: copying a param with shape torch.Size([4096, 2048]) from checkpoint, the shape in current model is torch.Size([3072, 1536]). + size mismatch for mlp.dense_h_to_4h.bias: copying a param with shape torch.Size([4096]) from checkpoint, the shape in current model is torch.Size([3072]). + size mismatch for mlp.dense_4h_to_h.weight: copying a param with shape torch.Size([2048, 4096]) from checkpoint, the shape in current model is torch.Size([1536, 3072]). + size mismatch for mlp.dense_4h_to_h.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + +============================================================ + return _run_code(code, main_globals, None, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 87, in _run_code + exec(code, run_globals) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 765, in + main() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 761, in main + run(args) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 752, in run + elastic_launch( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 131, in __call__ + return launch_agent(self._config, self._entrypoint, list(args)) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 245, in launch_agent + raise ChildFailedError( +torch.distributed.elastic.multiprocessing.errors.ChildFailedError: +============================================================ +/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py FAILED +------------------------------------------------------------ +Failures: +[1]: + time : 2022-10-06_13:47:53 + host : jean-zay-iam28-ib0 + rank : 57 (local_rank: 1) + exitcode : 1 (pid: 3917403) + error_file: /tmp/torchelastic_o651cfy4/none_aj6au2t3/attempt_0/1/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2640, in _load_checkpoint + self.load_module_state_dict(state_dict=checkpoint['module'], + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1340, in load_module_state_dict + self.module.load_state_dir(load_dir=self._curr_ckpt_path, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/module.py", line 604, in load_state_dir + layer.load_state_dict(checkpoint) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1604, in load_state_dict + raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format( + RuntimeError: Error(s) in loading state_dict for ParallelTransformerLayerPipe: + size mismatch for input_layernorm.weight: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + size mismatch for input_layernorm.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + size mismatch for self_attention.query_key_value.weight: copying a param with shape torch.Size([3072, 2048]) from checkpoint, the shape in current model is torch.Size([2304, 1536]). + size mismatch for self_attention.query_key_value.bias: copying a param with shape torch.Size([3072]) from checkpoint, the shape in current model is torch.Size([2304]). + size mismatch for self_attention.dense.weight: copying a param with shape torch.Size([2048, 1024]) from checkpoint, the shape in current model is torch.Size([1536, 768]). + size mismatch for self_attention.dense.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + size mismatch for post_attention_layernorm.weight: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + size mismatch for post_attention_layernorm.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + size mismatch for mlp.dense_h_to_4h.weight: copying a param with shape torch.Size([4096, 2048]) from checkpoint, the shape in current model is torch.Size([3072, 1536]). + size mismatch for mlp.dense_h_to_4h.bias: copying a param with shape torch.Size([4096]) from checkpoint, the shape in current model is torch.Size([3072]). + size mismatch for mlp.dense_4h_to_h.weight: copying a param with shape torch.Size([2048, 4096]) from checkpoint, the shape in current model is torch.Size([1536, 3072]). + size mismatch for mlp.dense_4h_to_h.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + +[2]: + time : 2022-10-06_13:47:53 + host : jean-zay-iam28-ib0 + rank : 58 (local_rank: 2) + exitcode : 1 (pid: 3917404) + error_file: /tmp/torchelastic_o651cfy4/none_aj6au2t3/attempt_0/2/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2640, in _load_checkpoint + self.load_module_state_dict(state_dict=checkpoint['module'], + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1340, in load_module_state_dict + self.module.load_state_dir(load_dir=self._curr_ckpt_path, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/module.py", line 604, in load_state_dir + layer.load_state_dict(checkpoint) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1604, in load_state_dict + raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format( + RuntimeError: Error(s) in loading state_dict for ParallelTransformerLayerPipe: + size mismatch for input_layernorm.weight: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + size mismatch for input_layernorm.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + size mismatch for self_attention.query_key_value.weight: copying a param with shape torch.Size([3072, 2048]) from checkpoint, the shape in current model is torch.Size([2304, 1536]). + size mismatch for self_attention.query_key_value.bias: copying a param with shape torch.Size([3072]) from checkpoint, the shape in current model is torch.Size([2304]). + size mismatch for self_attention.dense.weight: copying a param with shape torch.Size([2048, 1024]) from checkpoint, the shape in current model is torch.Size([1536, 768]). + size mismatch for self_attention.dense.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + size mismatch for post_attention_layernorm.weight: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + size mismatch for post_attention_layernorm.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + size mismatch for mlp.dense_h_to_4h.weight: copying a param with shape torch.Size([4096, 2048]) from checkpoint, the shape in current model is torch.Size([3072, 1536]). + size mismatch for mlp.dense_h_to_4h.bias: copying a param with shape torch.Size([4096]) from checkpoint, the shape in current model is torch.Size([3072]). + size mismatch for mlp.dense_4h_to_h.weight: copying a param with shape torch.Size([2048, 4096]) from checkpoint, the shape in current model is torch.Size([1536, 3072]). + size mismatch for mlp.dense_4h_to_h.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + +[3]: + time : 2022-10-06_13:47:53 + host : jean-zay-iam28-ib0 + rank : 59 (local_rank: 3) + exitcode : 1 (pid: 3917405) + error_file: /tmp/torchelastic_o651cfy4/none_aj6au2t3/attempt_0/3/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2640, in _load_checkpoint + self.load_module_state_dict(state_dict=checkpoint['module'], + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1340, in load_module_state_dict + self.module.load_state_dir(load_dir=self._curr_ckpt_path, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/module.py", line 604, in load_state_dir + layer.load_state_dict(checkpoint) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1604, in load_state_dict + raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format( + RuntimeError: Error(s) in loading state_dict for ParallelTransformerLayerPipe: + size mismatch for input_layernorm.weight: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + size mismatch for input_layernorm.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + size mismatch for self_attention.query_key_value.weight: copying a param with shape torch.Size([3072, 2048]) from checkpoint, the shape in current model is torch.Size([2304, 1536]). + size mismatch for self_attention.query_key_value.bias: copying a param with shape torch.Size([3072]) from checkpoint, the shape in current model is torch.Size([2304]). + size mismatch for self_attention.dense.weight: copying a param with shape torch.Size([2048, 1024]) from checkpoint, the shape in current model is torch.Size([1536, 768]). + size mismatch for self_attention.dense.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + size mismatch for post_attention_layernorm.weight: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + size mismatch for post_attention_layernorm.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + size mismatch for mlp.dense_h_to_4h.weight: copying a param with shape torch.Size([4096, 2048]) from checkpoint, the shape in current model is torch.Size([3072, 1536]). + size mismatch for mlp.dense_h_to_4h.bias: copying a param with shape torch.Size([4096]) from checkpoint, the shape in current model is torch.Size([3072]). + size mismatch for mlp.dense_4h_to_h.weight: copying a param with shape torch.Size([2048, 4096]) from checkpoint, the shape in current model is torch.Size([1536, 3072]). + size mismatch for mlp.dense_4h_to_h.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + +[4]: + time : 2022-10-06_13:47:54 + host : jean-zay-iam28-ib0 + rank : 60 (local_rank: 4) + exitcode : 1 (pid: 3917406) + error_file: /tmp/torchelastic_o651cfy4/none_aj6au2t3/attempt_0/4/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2640, in _load_checkpoint + self.load_module_state_dict(state_dict=checkpoint['module'], + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1340, in load_module_state_dict + self.module.load_state_dir(load_dir=self._curr_ckpt_path, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/module.py", line 604, in load_state_dir + layer.load_state_dict(checkpoint) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1604, in load_state_dict + raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format( + RuntimeError: Error(s) in loading state_dict for ParallelTransformerLayerPipe: + size mismatch for input_layernorm.weight: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + size mismatch for input_layernorm.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + size mismatch for self_attention.query_key_value.weight: copying a param with shape torch.Size([3072, 2048]) from checkpoint, the shape in current model is torch.Size([2304, 1536]). + size mismatch for self_attention.query_key_value.bias: copying a param with shape torch.Size([3072]) from checkpoint, the shape in current model is torch.Size([2304]). + size mismatch for self_attention.dense.weight: copying a param with shape torch.Size([2048, 1024]) from checkpoint, the shape in current model is torch.Size([1536, 768]). + size mismatch for self_attention.dense.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + size mismatch for post_attention_layernorm.weight: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + size mismatch for post_attention_layernorm.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + size mismatch for mlp.dense_h_to_4h.weight: copying a param with shape torch.Size([4096, 2048]) from checkpoint, the shape in current model is torch.Size([3072, 1536]). + size mismatch for mlp.dense_h_to_4h.bias: copying a param with shape torch.Size([4096]) from checkpoint, the shape in current model is torch.Size([3072]). + size mismatch for mlp.dense_4h_to_h.weight: copying a param with shape torch.Size([2048, 4096]) from checkpoint, the shape in current model is torch.Size([1536, 3072]). + size mismatch for mlp.dense_4h_to_h.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + +[5]: + time : 2022-10-06_13:47:53 + host : jean-zay-iam28-ib0 + rank : 61 (local_rank: 5) + exitcode : 1 (pid: 3917407) + error_file: /tmp/torchelastic_o651cfy4/none_aj6au2t3/attempt_0/5/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2640, in _load_checkpoint + self.load_module_state_dict(state_dict=checkpoint['module'], + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1340, in load_module_state_dict + self.module.load_state_dir(load_dir=self._curr_ckpt_path, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/module.py", line 604, in load_state_dir + layer.load_state_dict(checkpoint) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1604, in load_state_dict + raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format( + RuntimeError: Error(s) in loading state_dict for ParallelTransformerLayerPipe: + size mismatch for input_layernorm.weight: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + size mismatch for input_layernorm.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + size mismatch for self_attention.query_key_value.weight: copying a param with shape torch.Size([3072, 2048]) from checkpoint, the shape in current model is torch.Size([2304, 1536]). + size mismatch for self_attention.query_key_value.bias: copying a param with shape torch.Size([3072]) from checkpoint, the shape in current model is torch.Size([2304]). + size mismatch for self_attention.dense.weight: copying a param with shape torch.Size([2048, 1024]) from checkpoint, the shape in current model is torch.Size([1536, 768]). + size mismatch for self_attention.dense.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + size mismatch for post_attention_layernorm.weight: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + size mismatch for post_attention_layernorm.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + size mismatch for mlp.dense_h_to_4h.weight: copying a param with shape torch.Size([4096, 2048]) from checkpoint, the shape in current model is torch.Size([3072, 1536]). + size mismatch for mlp.dense_h_to_4h.bias: copying a param with shape torch.Size([4096]) from checkpoint, the shape in current model is torch.Size([3072]). + size mismatch for mlp.dense_4h_to_h.weight: copying a param with shape torch.Size([2048, 4096]) from checkpoint, the shape in current model is torch.Size([1536, 3072]). + size mismatch for mlp.dense_4h_to_h.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + +[6]: + time : 2022-10-06_13:47:54 + host : jean-zay-iam28-ib0 + rank : 62 (local_rank: 6) + exitcode : 1 (pid: 3917408) + error_file: /tmp/torchelastic_o651cfy4/none_aj6au2t3/attempt_0/6/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2640, in _load_checkpoint + self.load_module_state_dict(state_dict=checkpoint['module'], + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1340, in load_module_state_dict + self.module.load_state_dir(load_dir=self._curr_ckpt_path, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/module.py", line 604, in load_state_dir + layer.load_state_dict(checkpoint) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1604, in load_state_dict + raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format( + RuntimeError: Error(s) in loading state_dict for ParallelTransformerLayerPipe: + size mismatch for input_layernorm.weight: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + size mismatch for input_layernorm.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + size mismatch for self_attention.query_key_value.weight: copying a param with shape torch.Size([3072, 2048]) from checkpoint, the shape in current model is torch.Size([2304, 1536]). + size mismatch for self_attention.query_key_value.bias: copying a param with shape torch.Size([3072]) from checkpoint, the shape in current model is torch.Size([2304]). + size mismatch for self_attention.dense.weight: copying a param with shape torch.Size([2048, 1024]) from checkpoint, the shape in current model is torch.Size([1536, 768]). + size mismatch for self_attention.dense.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + size mismatch for post_attention_layernorm.weight: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + size mismatch for post_attention_layernorm.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + size mismatch for mlp.dense_h_to_4h.weight: copying a param with shape torch.Size([4096, 2048]) from checkpoint, the shape in current model is torch.Size([3072, 1536]). + size mismatch for mlp.dense_h_to_4h.bias: copying a param with shape torch.Size([4096]) from checkpoint, the shape in current model is torch.Size([3072]). + size mismatch for mlp.dense_4h_to_h.weight: copying a param with shape torch.Size([2048, 4096]) from checkpoint, the shape in current model is torch.Size([1536, 3072]). + size mismatch for mlp.dense_4h_to_h.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + +[7]: + time : 2022-10-06_13:47:54 + host : jean-zay-iam28-ib0 + rank : 63 (local_rank: 7) + exitcode : 1 (pid: 3917409) + error_file: /tmp/torchelastic_o651cfy4/none_aj6au2t3/attempt_0/7/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2640, in _load_checkpoint + self.load_module_state_dict(state_dict=checkpoint['module'], + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1340, in load_module_state_dict + self.module.load_state_dir(load_dir=self._curr_ckpt_path, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/module.py", line 604, in load_state_dir + layer.load_state_dict(checkpoint) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1604, in load_state_dict + raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format( + RuntimeError: Error(s) in loading state_dict for ParallelTransformerLayerPipe: + size mismatch for input_layernorm.weight: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + size mismatch for input_layernorm.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + size mismatch for self_attention.query_key_value.weight: copying a param with shape torch.Size([3072, 2048]) from checkpoint, the shape in current model is torch.Size([2304, 1536]). + size mismatch for self_attention.query_key_value.bias: copying a param with shape torch.Size([3072]) from checkpoint, the shape in current model is torch.Size([2304]). + size mismatch for self_attention.dense.weight: copying a param with shape torch.Size([2048, 1024]) from checkpoint, the shape in current model is torch.Size([1536, 768]). + size mismatch for self_attention.dense.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + size mismatch for post_attention_layernorm.weight: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + size mismatch for post_attention_layernorm.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + size mismatch for mlp.dense_h_to_4h.weight: copying a param with shape torch.Size([4096, 2048]) from checkpoint, the shape in current model is torch.Size([3072, 1536]). + size mismatch for mlp.dense_h_to_4h.bias: copying a param with shape torch.Size([4096]) from checkpoint, the shape in current model is torch.Size([3072]). + size mismatch for mlp.dense_4h_to_h.weight: copying a param with shape torch.Size([2048, 4096]) from checkpoint, the shape in current model is torch.Size([1536, 3072]). + size mismatch for mlp.dense_4h_to_h.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + +------------------------------------------------------------ +Root Cause (first observed failure): +[0]: + time : 2022-10-06_13:47:53 + host : jean-zay-iam28-ib0 + rank : 56 (local_rank: 0) + exitcode : 1 (pid: 3917402) + error_file: /tmp/torchelastic_o651cfy4/none_aj6au2t3/attempt_0/0/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2640, in _load_checkpoint + self.load_module_state_dict(state_dict=checkpoint['module'], + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1340, in load_module_state_dict + self.module.load_state_dir(load_dir=self._curr_ckpt_path, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/module.py", line 604, in load_state_dir + layer.load_state_dict(checkpoint) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1604, in load_state_dict + raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format( + RuntimeError: Error(s) in loading state_dict for ParallelTransformerLayerPipe: + size mismatch for input_layernorm.weight: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + size mismatch for input_layernorm.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + size mismatch for self_attention.query_key_value.weight: copying a param with shape torch.Size([3072, 2048]) from checkpoint, the shape in current model is torch.Size([2304, 1536]). + size mismatch for self_attention.query_key_value.bias: copying a param with shape torch.Size([3072]) from checkpoint, the shape in current model is torch.Size([2304]). + size mismatch for self_attention.dense.weight: copying a param with shape torch.Size([2048, 1024]) from checkpoint, the shape in current model is torch.Size([1536, 768]). + size mismatch for self_attention.dense.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + size mismatch for post_attention_layernorm.weight: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + size mismatch for post_attention_layernorm.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + size mismatch for mlp.dense_h_to_4h.weight: copying a param with shape torch.Size([4096, 2048]) from checkpoint, the shape in current model is torch.Size([3072, 1536]). + size mismatch for mlp.dense_h_to_4h.bias: copying a param with shape torch.Size([4096]) from checkpoint, the shape in current model is torch.Size([3072]). + size mismatch for mlp.dense_4h_to_h.weight: copying a param with shape torch.Size([2048, 4096]) from checkpoint, the shape in current model is torch.Size([1536, 3072]). + size mismatch for mlp.dense_4h_to_h.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + +============================================================ + return _run_code(code, main_globals, None, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 87, in _run_code + exec(code, run_globals) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 765, in + main() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 761, in main + run(args) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 752, in run + elastic_launch( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 131, in __call__ + return launch_agent(self._config, self._entrypoint, list(args)) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 245, in launch_agent + raise ChildFailedError( +torch.distributed.elastic.multiprocessing.errors.ChildFailedError: +============================================================ +/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py FAILED +------------------------------------------------------------ +Failures: +[0]: + time : 2022-10-06_13:47:54 + host : jean-zay-iam24-ib0 + rank : 40 (local_rank: 0) + exitcode : 1 (pid: 893477) + error_file: /tmp/torchelastic_4us6_8vz/none_rqccc2pz/attempt_0/0/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2640, in _load_checkpoint + self.load_module_state_dict(state_dict=checkpoint['module'], + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1340, in load_module_state_dict + self.module.load_state_dir(load_dir=self._curr_ckpt_path, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/module.py", line 604, in load_state_dir + layer.load_state_dict(checkpoint) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1604, in load_state_dict + raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format( + RuntimeError: Error(s) in loading state_dict for ParallelTransformerLayerPipe: + size mismatch for input_layernorm.weight: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + size mismatch for input_layernorm.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + size mismatch for self_attention.query_key_value.weight: copying a param with shape torch.Size([3072, 2048]) from checkpoint, the shape in current model is torch.Size([2304, 1536]). + size mismatch for self_attention.query_key_value.bias: copying a param with shape torch.Size([3072]) from checkpoint, the shape in current model is torch.Size([2304]). + size mismatch for self_attention.dense.weight: copying a param with shape torch.Size([2048, 1024]) from checkpoint, the shape in current model is torch.Size([1536, 768]). + size mismatch for self_attention.dense.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + size mismatch for post_attention_layernorm.weight: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + size mismatch for post_attention_layernorm.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + size mismatch for mlp.dense_h_to_4h.weight: copying a param with shape torch.Size([4096, 2048]) from checkpoint, the shape in current model is torch.Size([3072, 1536]). + size mismatch for mlp.dense_h_to_4h.bias: copying a param with shape torch.Size([4096]) from checkpoint, the shape in current model is torch.Size([3072]). + size mismatch for mlp.dense_4h_to_h.weight: copying a param with shape torch.Size([2048, 4096]) from checkpoint, the shape in current model is torch.Size([1536, 3072]). + size mismatch for mlp.dense_4h_to_h.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + +[1]: + time : 2022-10-06_13:47:54 + host : jean-zay-iam24-ib0 + rank : 41 (local_rank: 1) + exitcode : 1 (pid: 893478) + error_file: /tmp/torchelastic_4us6_8vz/none_rqccc2pz/attempt_0/1/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2640, in _load_checkpoint + self.load_module_state_dict(state_dict=checkpoint['module'], + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1340, in load_module_state_dict + self.module.load_state_dir(load_dir=self._curr_ckpt_path, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/module.py", line 604, in load_state_dir + layer.load_state_dict(checkpoint) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1604, in load_state_dict + raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format( + RuntimeError: Error(s) in loading state_dict for ParallelTransformerLayerPipe: + size mismatch for input_layernorm.weight: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + size mismatch for input_layernorm.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + size mismatch for self_attention.query_key_value.weight: copying a param with shape torch.Size([3072, 2048]) from checkpoint, the shape in current model is torch.Size([2304, 1536]). + size mismatch for self_attention.query_key_value.bias: copying a param with shape torch.Size([3072]) from checkpoint, the shape in current model is torch.Size([2304]). + size mismatch for self_attention.dense.weight: copying a param with shape torch.Size([2048, 1024]) from checkpoint, the shape in current model is torch.Size([1536, 768]). + size mismatch for self_attention.dense.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + size mismatch for post_attention_layernorm.weight: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + size mismatch for post_attention_layernorm.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + size mismatch for mlp.dense_h_to_4h.weight: copying a param with shape torch.Size([4096, 2048]) from checkpoint, the shape in current model is torch.Size([3072, 1536]). + size mismatch for mlp.dense_h_to_4h.bias: copying a param with shape torch.Size([4096]) from checkpoint, the shape in current model is torch.Size([3072]). + size mismatch for mlp.dense_4h_to_h.weight: copying a param with shape torch.Size([2048, 4096]) from checkpoint, the shape in current model is torch.Size([1536, 3072]). + size mismatch for mlp.dense_4h_to_h.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + +[2]: + time : 2022-10-06_13:47:54 + host : jean-zay-iam24-ib0 + rank : 42 (local_rank: 2) + exitcode : 1 (pid: 893479) + error_file: /tmp/torchelastic_4us6_8vz/none_rqccc2pz/attempt_0/2/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2640, in _load_checkpoint + self.load_module_state_dict(state_dict=checkpoint['module'], + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1340, in load_module_state_dict + self.module.load_state_dir(load_dir=self._curr_ckpt_path, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/module.py", line 604, in load_state_dir + layer.load_state_dict(checkpoint) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1604, in load_state_dict + raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format( + RuntimeError: Error(s) in loading state_dict for ParallelTransformerLayerPipe: + size mismatch for input_layernorm.weight: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + size mismatch for input_layernorm.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + size mismatch for self_attention.query_key_value.weight: copying a param with shape torch.Size([3072, 2048]) from checkpoint, the shape in current model is torch.Size([2304, 1536]). + size mismatch for self_attention.query_key_value.bias: copying a param with shape torch.Size([3072]) from checkpoint, the shape in current model is torch.Size([2304]). + size mismatch for self_attention.dense.weight: copying a param with shape torch.Size([2048, 1024]) from checkpoint, the shape in current model is torch.Size([1536, 768]). + size mismatch for self_attention.dense.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + size mismatch for post_attention_layernorm.weight: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + size mismatch for post_attention_layernorm.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + size mismatch for mlp.dense_h_to_4h.weight: copying a param with shape torch.Size([4096, 2048]) from checkpoint, the shape in current model is torch.Size([3072, 1536]). + size mismatch for mlp.dense_h_to_4h.bias: copying a param with shape torch.Size([4096]) from checkpoint, the shape in current model is torch.Size([3072]). + size mismatch for mlp.dense_4h_to_h.weight: copying a param with shape torch.Size([2048, 4096]) from checkpoint, the shape in current model is torch.Size([1536, 3072]). + size mismatch for mlp.dense_4h_to_h.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + +[3]: + time : 2022-10-06_13:47:54 + host : jean-zay-iam24-ib0 + rank : 43 (local_rank: 3) + exitcode : 1 (pid: 893480) + error_file: /tmp/torchelastic_4us6_8vz/none_rqccc2pz/attempt_0/3/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2640, in _load_checkpoint + self.load_module_state_dict(state_dict=checkpoint['module'], + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1340, in load_module_state_dict + self.module.load_state_dir(load_dir=self._curr_ckpt_path, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/module.py", line 604, in load_state_dir + layer.load_state_dict(checkpoint) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1604, in load_state_dict + raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format( + RuntimeError: Error(s) in loading state_dict for ParallelTransformerLayerPipe: + size mismatch for input_layernorm.weight: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + size mismatch for input_layernorm.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + size mismatch for self_attention.query_key_value.weight: copying a param with shape torch.Size([3072, 2048]) from checkpoint, the shape in current model is torch.Size([2304, 1536]). + size mismatch for self_attention.query_key_value.bias: copying a param with shape torch.Size([3072]) from checkpoint, the shape in current model is torch.Size([2304]). + size mismatch for self_attention.dense.weight: copying a param with shape torch.Size([2048, 1024]) from checkpoint, the shape in current model is torch.Size([1536, 768]). + size mismatch for self_attention.dense.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + size mismatch for post_attention_layernorm.weight: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + size mismatch for post_attention_layernorm.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + size mismatch for mlp.dense_h_to_4h.weight: copying a param with shape torch.Size([4096, 2048]) from checkpoint, the shape in current model is torch.Size([3072, 1536]). + size mismatch for mlp.dense_h_to_4h.bias: copying a param with shape torch.Size([4096]) from checkpoint, the shape in current model is torch.Size([3072]). + size mismatch for mlp.dense_4h_to_h.weight: copying a param with shape torch.Size([2048, 4096]) from checkpoint, the shape in current model is torch.Size([1536, 3072]). + size mismatch for mlp.dense_4h_to_h.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + +[5]: + time : 2022-10-06_13:47:53 + host : jean-zay-iam24-ib0 + rank : 45 (local_rank: 5) + exitcode : 1 (pid: 893482) + error_file: /tmp/torchelastic_4us6_8vz/none_rqccc2pz/attempt_0/5/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2640, in _load_checkpoint + self.load_module_state_dict(state_dict=checkpoint['module'], + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1340, in load_module_state_dict + self.module.load_state_dir(load_dir=self._curr_ckpt_path, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/module.py", line 604, in load_state_dir + layer.load_state_dict(checkpoint) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1604, in load_state_dict + raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format( + RuntimeError: Error(s) in loading state_dict for ParallelTransformerLayerPipe: + size mismatch for input_layernorm.weight: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + size mismatch for input_layernorm.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + size mismatch for self_attention.query_key_value.weight: copying a param with shape torch.Size([3072, 2048]) from checkpoint, the shape in current model is torch.Size([2304, 1536]). + size mismatch for self_attention.query_key_value.bias: copying a param with shape torch.Size([3072]) from checkpoint, the shape in current model is torch.Size([2304]). + size mismatch for self_attention.dense.weight: copying a param with shape torch.Size([2048, 1024]) from checkpoint, the shape in current model is torch.Size([1536, 768]). + size mismatch for self_attention.dense.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + size mismatch for post_attention_layernorm.weight: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + size mismatch for post_attention_layernorm.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + size mismatch for mlp.dense_h_to_4h.weight: copying a param with shape torch.Size([4096, 2048]) from checkpoint, the shape in current model is torch.Size([3072, 1536]). + size mismatch for mlp.dense_h_to_4h.bias: copying a param with shape torch.Size([4096]) from checkpoint, the shape in current model is torch.Size([3072]). + size mismatch for mlp.dense_4h_to_h.weight: copying a param with shape torch.Size([2048, 4096]) from checkpoint, the shape in current model is torch.Size([1536, 3072]). + size mismatch for mlp.dense_4h_to_h.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + +[6]: + time : 2022-10-06_13:47:53 + host : jean-zay-iam24-ib0 + rank : 46 (local_rank: 6) + exitcode : 1 (pid: 893483) + error_file: /tmp/torchelastic_4us6_8vz/none_rqccc2pz/attempt_0/6/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2640, in _load_checkpoint + self.load_module_state_dict(state_dict=checkpoint['module'], + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1340, in load_module_state_dict + self.module.load_state_dir(load_dir=self._curr_ckpt_path, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/module.py", line 604, in load_state_dir + layer.load_state_dict(checkpoint) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1604, in load_state_dict + raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format( + RuntimeError: Error(s) in loading state_dict for ParallelTransformerLayerPipe: + size mismatch for input_layernorm.weight: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + size mismatch for input_layernorm.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + size mismatch for self_attention.query_key_value.weight: copying a param with shape torch.Size([3072, 2048]) from checkpoint, the shape in current model is torch.Size([2304, 1536]). + size mismatch for self_attention.query_key_value.bias: copying a param with shape torch.Size([3072]) from checkpoint, the shape in current model is torch.Size([2304]). + size mismatch for self_attention.dense.weight: copying a param with shape torch.Size([2048, 1024]) from checkpoint, the shape in current model is torch.Size([1536, 768]). + size mismatch for self_attention.dense.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + size mismatch for post_attention_layernorm.weight: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + size mismatch for post_attention_layernorm.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + size mismatch for mlp.dense_h_to_4h.weight: copying a param with shape torch.Size([4096, 2048]) from checkpoint, the shape in current model is torch.Size([3072, 1536]). + size mismatch for mlp.dense_h_to_4h.bias: copying a param with shape torch.Size([4096]) from checkpoint, the shape in current model is torch.Size([3072]). + size mismatch for mlp.dense_4h_to_h.weight: copying a param with shape torch.Size([2048, 4096]) from checkpoint, the shape in current model is torch.Size([1536, 3072]). + size mismatch for mlp.dense_4h_to_h.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + +[7]: + time : 2022-10-06_13:47:53 + host : jean-zay-iam24-ib0 + rank : 47 (local_rank: 7) + exitcode : 1 (pid: 893484) + error_file: /tmp/torchelastic_4us6_8vz/none_rqccc2pz/attempt_0/7/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2640, in _load_checkpoint + self.load_module_state_dict(state_dict=checkpoint['module'], + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1340, in load_module_state_dict + self.module.load_state_dir(load_dir=self._curr_ckpt_path, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/module.py", line 604, in load_state_dir + layer.load_state_dict(checkpoint) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1604, in load_state_dict + raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format( + RuntimeError: Error(s) in loading state_dict for ParallelTransformerLayerPipe: + size mismatch for input_layernorm.weight: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + size mismatch for input_layernorm.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + size mismatch for self_attention.query_key_value.weight: copying a param with shape torch.Size([3072, 2048]) from checkpoint, the shape in current model is torch.Size([2304, 1536]). + size mismatch for self_attention.query_key_value.bias: copying a param with shape torch.Size([3072]) from checkpoint, the shape in current model is torch.Size([2304]). + size mismatch for self_attention.dense.weight: copying a param with shape torch.Size([2048, 1024]) from checkpoint, the shape in current model is torch.Size([1536, 768]). + size mismatch for self_attention.dense.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + size mismatch for post_attention_layernorm.weight: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + size mismatch for post_attention_layernorm.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + size mismatch for mlp.dense_h_to_4h.weight: copying a param with shape torch.Size([4096, 2048]) from checkpoint, the shape in current model is torch.Size([3072, 1536]). + size mismatch for mlp.dense_h_to_4h.bias: copying a param with shape torch.Size([4096]) from checkpoint, the shape in current model is torch.Size([3072]). + size mismatch for mlp.dense_4h_to_h.weight: copying a param with shape torch.Size([2048, 4096]) from checkpoint, the shape in current model is torch.Size([1536, 3072]). + size mismatch for mlp.dense_4h_to_h.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + +------------------------------------------------------------ +Root Cause (first observed failure): +[4]: + time : 2022-10-06_13:47:53 + host : jean-zay-iam24-ib0 + rank : 44 (local_rank: 4) + exitcode : 1 (pid: 893481) + error_file: /tmp/torchelastic_4us6_8vz/none_rqccc2pz/attempt_0/4/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 451, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2640, in _load_checkpoint + self.load_module_state_dict(state_dict=checkpoint['module'], + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1340, in load_module_state_dict + self.module.load_state_dir(load_dir=self._curr_ckpt_path, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/module.py", line 604, in load_state_dir + layer.load_state_dict(checkpoint) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1604, in load_state_dict + raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format( + RuntimeError: Error(s) in loading state_dict for ParallelTransformerLayerPipe: + size mismatch for input_layernorm.weight: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + size mismatch for input_layernorm.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + size mismatch for self_attention.query_key_value.weight: copying a param with shape torch.Size([3072, 2048]) from checkpoint, the shape in current model is torch.Size([2304, 1536]). + size mismatch for self_attention.query_key_value.bias: copying a param with shape torch.Size([3072]) from checkpoint, the shape in current model is torch.Size([2304]). + size mismatch for self_attention.dense.weight: copying a param with shape torch.Size([2048, 1024]) from checkpoint, the shape in current model is torch.Size([1536, 768]). + size mismatch for self_attention.dense.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + size mismatch for post_attention_layernorm.weight: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + size mismatch for post_attention_layernorm.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + size mismatch for mlp.dense_h_to_4h.weight: copying a param with shape torch.Size([4096, 2048]) from checkpoint, the shape in current model is torch.Size([3072, 1536]). + size mismatch for mlp.dense_h_to_4h.bias: copying a param with shape torch.Size([4096]) from checkpoint, the shape in current model is torch.Size([3072]). + size mismatch for mlp.dense_4h_to_h.weight: copying a param with shape torch.Size([2048, 4096]) from checkpoint, the shape in current model is torch.Size([1536, 3072]). + size mismatch for mlp.dense_4h_to_h.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1536]). + +============================================================ +srun: error: jean-zay-iam10: task 0: Exited with exit code 1 +srun: launch/slurm: _step_signal: Terminating StepId=2075886.0 +srun: error: jean-zay-iam27: task 6: Exited with exit code 1 +srun: error: jean-zay-iam12: task 2: Exited with exit code 1 +srun: error: jean-zay-iam11: task 1: Exited with exit code 1 +srun: error: jean-zay-iam22: task 3: Exited with exit code 1 +srun: error: jean-zay-iam24: task 5: Exited with exit code 1 +srun: error: jean-zay-iam28: task 7: Exited with exit code 1 +srun: error: jean-zay-iam23: task 4: Exited with exit code 1 +WARNING:__main__: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +WARNING:__main__: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +WARNING:__main__: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +WARNING:__main__: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +WARNING:__main__: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +WARNING:__main__: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +WARNING:__main__: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +WARNING:__main__: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +[default7]:> setting tensorboard ... +[default0]:Offline mode: forcing local_files_only=True +[default0]:Offline mode: forcing local_files_only=True +[default0]:loading file tokenizer.json from cache at /gpfswork/rech/six/commun/models/models--bigscience--tokenizer/snapshots/d56c744c331ce0ffa516ed084785eb22da74b91e/tokenizer.json +[default0]:loading file added_tokens.json from cache at None +[default0]:loading file special_tokens_map.json from cache at /gpfswork/rech/six/commun/models/models--bigscience--tokenizer/snapshots/d56c744c331ce0ffa516ed084785eb22da74b91e/special_tokens_map.json +[default0]:loading file tokenizer_config.json from cache at /gpfswork/rech/six/commun/models/models--bigscience--tokenizer/snapshots/d56c744c331ce0ffa516ed084785eb22da74b91e/tokenizer_config.json +[default0]:using world size: 64, data-parallel-size: 16, tensor-model-parallel size: 2, pipeline-model-parallel size: 2 +[default0]:using torch.float16 for parameters ... +[default0]:------------------------ arguments ------------------------ +[default0]: abort_on_unmet_fused_kernel_constraints ......... True +[default0]: accumulate_allreduce_grads_in_fp32 .............. False +[default0]: adam_beta1 ...................................... 0.9 +[default0]: adam_beta2 ...................................... 0.95 +[default0]: adam_eps ........................................ 1e-08 +[default0]: adlr_autoresume ................................. False +[default0]: adlr_autoresume_interval ........................ 1000 +[default0]: apply_query_key_layer_scaling ................... True +[default0]: apply_residual_connection_post_layernorm ........ False +[default0]: attention_dropout ............................... 0.1 +[default0]: attention_softmax_in_fp32 ....................... False +[default0]: bert_binary_head ................................ True +[default0]: bert_load ....................................... None +[default0]: bf16 ............................................ False +[default0]: bias_dropout_fusion ............................. True +[default0]: bias_gelu_fusion ................................ True +[default0]: biencoder_projection_dim ........................ 0 +[default0]: biencoder_shared_query_context_model ............ False +[default0]: block_data_path ................................. None +[default0]: checkpoint_activations .......................... True +[default0]: checkpoint_in_cpu ............................... False +[default0]: checkpoint_num_layers ........................... 1 +[default0]: clip_grad ....................................... 1.0 +[default0]: codecarbon_dir .................................. None +[default0]: consumed_train_samples .......................... 0 +[default0]: consumed_train_tokens ........................... 0 +[default0]: consumed_valid_samples .......................... 0 +[default0]: contigious_checkpointing ........................ False +[default0]: cpu_optimizer ................................... False +[default0]: cpu_torch_adam .................................. False +[default0]: curriculum_learning ............................. False +[default0]: data_impl ....................................... mmap +[default0]: data_parallel_size .............................. 16 +[default0]: data_path ....................................... None +[default0]: dataloader_type ................................. single +[default0]: DDP_impl ........................................ local +[default0]: decoder_seq_length .............................. None +[default0]: deepscale ....................................... False +[default0]: deepscale_config ................................ None +[default0]: deepspeed ....................................... True +[default0]: deepspeed_activation_checkpointing .............. True +[default0]: deepspeed_config ................................ ./ds_config.2076732.json +[default0]: deepspeed_mpi ................................... False +[default0]: distribute_checkpointed_activations ............. False +[default0]: distributed_backend ............................. nccl +[default0]: embed_layernorm ................................. True +[default0]: embedding_path .................................. None +[default0]: encoder_seq_length .............................. 2048 +[default0]: eod_mask_loss ................................... False +[default0]: eval_interval ................................... 125 +[default0]: eval_iters ...................................... 10 +[default0]: eval_only ....................................... None +[default0]: evidence_data_path .............................. None +[default0]: exit_duration_in_mins ........................... 5990 +[default0]: exit_interval ................................... None +[default0]: ffn_hidden_size ................................. 8192 +[default0]: finetune ........................................ False +[default0]: fp16 ............................................ True +[default0]: fp16_lm_cross_entropy ........................... False +[default0]: fp32_residual_connection ........................ False +[default0]: gigaflos_no_embeds .............................. 0 +[default0]: global_batch_size ............................... 2048 +[default0]: glu_activation .................................. None +[default0]: hidden_dropout .................................. 0.1 +[default0]: hidden_size ..................................... 2048 +[default0]: hysteresis ...................................... 2 +[default0]: ict_head_size ................................... None +[default0]: ict_load ........................................ None +[default0]: img_dim ......................................... 224 +[default0]: indexer_batch_size .............................. 128 +[default0]: indexer_log_interval ............................ 1000 +[default0]: inference ....................................... False +[default0]: init_method_std ................................. 0.0048 +[default0]: init_method_xavier_uniform ...................... False +[default0]: initial_loss_scale .............................. 4294967296 +[default0]: kill_switch_path ................................ /gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/kill-switch-tr13b-1B3-mtf +[default0]: kv_channels ..................................... 128 +[default0]: layernorm_epsilon ............................... 1e-05 +[default0]: lazy_mpu_init ................................... None +[default0]: load ............................................ /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq +[default0]: local_rank ...................................... None +[default0]: log_batch_size_to_tensorboard ................... True +[default0]: log_interval .................................... 1 +[default0]: log_learning_rate_to_tensorboard ................ True +[default0]: log_level ....................................... None +[default0]: log_level_replica ............................... None +[default0]: log_loss_scale_to_tensorboard ................... True +[default0]: log_num_zeros_in_grad ........................... False +[default0]: log_params_norm ................................. False +[default0]: log_path ........................................ None +[default0]: log_timers_to_tensorboard ....................... True +[default0]: log_validation_ppl_to_tensorboard ............... True +[default0]: loss_on_targets_only ............................ False +[default0]: loss_scale ...................................... None +[default0]: loss_scale_window ............................... 1000 +[default0]: lr .............................................. 2e-05 +[default0]: lr_decay_iters .................................. None +[default0]: lr_decay_samples ................................ None +[default0]: lr_decay_style .................................. constant +[default0]: lr_decay_tokens ................................. None +[default0]: lr_warmup_fraction .............................. None +[default0]: lr_warmup_iters ................................. 0 +[default0]: lr_warmup_samples ............................... 0 +[default0]: make_vocab_size_divisible_by .................... 128 +[default0]: mask_prob ....................................... 0.15 +[default0]: masked_softmax_fusion ........................... True +[default0]: max_position_embeddings ......................... 2048 +[default0]: mean_noise_span_length .......................... None +[default0]: memory_centric_tiled_linear ..................... False +[default0]: merge_file ...................................... None +[default0]: micro_batch_size ................................ 1 +[default0]: min_loss_scale .................................. 1.0 +[default0]: min_lr .......................................... 0.0 +[default0]: mmap_warmup ..................................... False +[default0]: no_load_optim ................................... True +[default0]: no_load_rng ..................................... None +[default0]: no_save_optim ................................... None +[default0]: no_save_rng ..................................... None +[default0]: noise_density ................................... None +[default0]: norm_target_loss ................................ True +[default0]: num_attention_heads ............................. 16 +[default0]: num_channels .................................... 3 +[default0]: num_classes ..................................... 1000 +[default0]: num_layers ...................................... 24 +[default0]: num_layers_per_virtual_pipeline_stage ........... None +[default0]: num_workers ..................................... 2 +[default0]: onnx_safe ....................................... None +[default0]: openai_gelu ..................................... False +[default0]: optimizer ....................................... adam +[default0]: override_lr_scheduler ........................... False +[default0]: pad_vocab_size_to ............................... 250880 +[default0]: params_dtype .................................... torch.float16 +[default0]: partition_activations ........................... False +[default0]: patch_dim ....................................... 16 +[default0]: pipeline_model_parallel_size .................... 2 +[default0]: position_embedding_type ......................... PositionEmbeddingType.alibi +[default0]: pp_partition_method ............................. type:transformer|embedding +[default0]: prefixlm ........................................ False +[default0]: profile_backward ................................ False +[default0]: query_in_block_prob ............................. 0.1 +[default0]: rampup_batch_size ............................... None +[default0]: rank ............................................ 0 +[default0]: remote_device ................................... none +[default0]: reset_attention_mask ............................ False +[default0]: reset_position_ids .............................. False +[default0]: reset_progress .................................. True +[default0]: retriever_report_topk_accuracies ................ [] +[default0]: retriever_score_scaling ......................... False +[default0]: retriever_seq_length ............................ 256 +[default0]: reweight_loss_based_on_position_frequency ....... False +[default0]: sample_rate ..................................... 1.0 +[default0]: save ............................................ /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq +[default0]: save_interval ................................... 2 +[default0]: scatter_gather_tensors_in_pipeline .............. True +[default0]: scattered_embeddings ............................ False +[default0]: seed ............................................ 42 +[default0]: seq_length ...................................... 2048 +[default0]: sgd_momentum .................................... 0.9 +[default0]: short_seq_prob .................................. 0.1 +[default0]: skip_train_iteration_range ...................... None +[default0]: split ........................................... None +[default0]: split_transformers .............................. False +[default0]: sync_tp_duplicated_parameters ................... False +[default0]: synchronize_each_layer .......................... False +[default0]: tensor_model_parallel_size ...................... 2 +[default0]: tensorboard_dir ................................. /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/tr13b-1B3-ml-logs/tensorboard/xp3capmixnewcodelonglossseq +[default0]: tensorboard_log_interval ........................ 1 +[default0]: tensorboard_queue_size .......................... 5 +[default0]: test_weighted_split_paths ....................... None +[default0]: test_weighted_split_paths_path .................. None +[default0]: tile_factor ..................................... 1 +[default0]: titles_data_path ................................ None +[default0]: tokenizer_name_or_path .......................... bigscience/tokenizer +[default0]: tokenizer_type .................................. PretrainedFromHF +[default0]: train_iters ..................................... None +[default0]: train_samples ................................... 6348800 +[default0]: train_tokens .................................... None +[default0]: train_weighted_split_names ...................... ['train'] +[default0]: train_weighted_split_paths ...................... [['/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_en', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_es', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_pt', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_code', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_fr', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ar', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_id', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_zh', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_hi', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_vi', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ur', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_te', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ta', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_bn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_mr', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_sw', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_gu', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_pa', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ne', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_yo', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ig', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ny', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_zu', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_xh', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_sn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ts', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_rw', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_lg', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_tn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_nso', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_rn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ml', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_kn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_or', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_as', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ln', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_wo', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_tum', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ki', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_st', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_fon', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ca', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_eu', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ak', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_bm', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_tw']] +[default0]: train_weighted_split_paths_path ................. None +[default0]: train_weighted_split_splits ..................... [['0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950']] +[default0]: train_weighted_split_weights .................... [['0.3924620202', '0.0797519865', '0.0645613968', '0.0592991658', '0.0584218969', '0.0492644683', '0.0485168956', '0.048344327', '0.0462777165', '0.0326663657', '0.0202046859', '0.0140392334', '0.0097374884', '0.0087708344', '0.0070173416', '0.0059077793', '0.0059055884', '0.0055112422', '0.0041465344', '0.0037157869', '0.0034255885', '0.0028662571', '0.0027776135', '0.0026823974', '0.0026594781', '0.0026358847', '0.0026264474', '0.0024850557', '0.0024808426', '0.0024194999', '0.0020327371', '0.0018436532', '0.0017427072', '0.0017007448', '0.0016458059', '0.0013303289', '0.0012908943', '0.0012221364', '0.001211688', '0.0012015765', '0.0011909595', '0.0011650068', '0.001138717', '0.0011385485', '0.0011275944', '0.0011195053']] +[default0]: universal_checkpoint ............................ False +[default0]: use_bnb_optimizer ............................... False +[default0]: use_checkpoint_lr_scheduler ..................... False +[default0]: use_contiguous_buffers_in_ddp ................... False +[default0]: use_cpu_initialization .......................... None +[default0]: use_one_sent_docs ............................... False +[default0]: use_pin_memory .................................. False +[default0]: valid_num_workers ............................... 2 +[default0]: valid_weighted_split_names ...................... ['validation_pretraining', 'validation'] +[default0]: valid_weighted_split_paths ...................... [['/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/ar/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_ar_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/ca/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_ca_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/code/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_code_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/en/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_en_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/es/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_es_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/eu/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_eu_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/fr/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_fr_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/id/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_id_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-as/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-as_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-bn/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-bn_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-gu/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-gu_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-hi/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-hi_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-kn/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-kn_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-ml/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-ml_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-mr/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-mr_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-ne/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-ne_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-or/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-or_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-pa/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-pa_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-ta/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-ta_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-te/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-te_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-ur/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-ur_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/nigercongo-all/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_nigercongo-all_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/oscar-en/meg_ds_bigscience_tokenizer_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/oscar-zh/meg_ds_bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/pt/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_pt_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/vi/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_vi_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/zhs/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_zhs_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/zht/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_zht_text_document'], ['/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_en', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_es', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_pt', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_code', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_fr', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ar', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_id', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_zh', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_hi', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_vi', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ur', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_te', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ta', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_bn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_mr', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_sw', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_gu', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_pa', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ne', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_yo', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ig', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ny', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_zu', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_xh', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_sn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ts', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_rw', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_lg', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_tn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_nso', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_rn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ml', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_kn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_or', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_as', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ln', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_wo', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_tum', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ki', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_st', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_fon', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ca', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_eu', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ak', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_bm', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_tw']] +[default0]: valid_weighted_split_paths_path ................. None +[default0]: valid_weighted_split_splits ..................... [['0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0'], ['0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1']] +[default0]: valid_weighted_split_weights .................... [['0.0330676168743166', '0.011242051312222764', '0.13027200903379185', '0.22171164529099704', '0.10667815627928671', '0.0015595123898173287', '0.13054018439603915', '0.01091803753667153', '0.00011021422347108609', '0.005492381453597748', '0.0004021215011318779', '0.007470068593492175', '0.0006190467776576425', '0.0010335296343329384', '0.0005012010684646179', '0.0006672772956128299', '0.00035928138344705506', '0.0005084433130291778', '0.0021137328219915496', '0.0009129946225980253', '0.0012454301613725426', '0.00031588689199263235', '0.08137213783015229', '0.055293935695898196', '0.04954150576361177', '0.02461641286531197', '0.12091748245519074', '0.0005177025345001541'], ['0.3924620202', '0.0797519865', '0.0645613968', '0.0592991658', '0.0584218969', '0.0492644683', '0.0485168956', '0.048344327', '0.0462777165', '0.0326663657', '0.0202046859', '0.0140392334', '0.0097374884', '0.0087708344', '0.0070173416', '0.0059077793', '0.0059055884', '0.0055112422', '0.0041465344', '0.0037157869', '0.0034255885', '0.0028662571', '0.0027776135', '0.0026823974', '0.0026594781', '0.0026358847', '0.0026264474', '0.0024850557', '0.0024808426', '0.0024194999', '0.0020327371', '0.0018436532', '0.0017427072', '0.0017007448', '0.0016458059', '0.0013303289', '0.0012908943', '0.0012221364', '0.001211688', '0.0012015765', '0.0011909595', '0.0011650068', '0.001138717', '0.0011385485', '0.0011275944', '0.0011195053']] +[default0]: virtual_pipeline_model_parallel_size ............ None +[default0]: vocab_extra_ids ................................. 0 +[default0]: vocab_file ...................................... None +[default0]: weight_decay .................................... 0.0001 +[default0]: world_size ...................................... 64 +[default0]: zero_allgather_bucket_size ...................... 0.0 +[default0]: zero_contigious_gradients ....................... False +[default0]: zero_reduce_bucket_size ......................... 0.0 +[default0]: zero_reduce_scatter ............................. False +[default0]: zero_stage ...................................... 1 +[default0]:-------------------- end of arguments --------------------- +[default0]:setting number of micro-batches to constant 128 +[default0]:> building PretrainedFromHF tokenizer ... +[default0]: vocab file is un-used. loading tokenizer from pre-trained model +[default0]: > padded vocab (size: 250680) with 200 dummy tokens (new size: 250880) +[default0]:DeepSpeed general environment info: +[default0]:torch install path ............... ['/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch'] +[default0]:torch version .................... 1.12.0 +[default0]:torch cuda version ............... 11.3 +[default0]:torch hip version ................ None +[default0]:nvcc version ..................... 11.4 +[default0]:deepspeed install path ........... ['/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed'] +[default0]:deepspeed info ................... 0.7.1+8b2a6371, 8b2a6371, master +[default0]:deepspeed wheel compiled w. ...... torch 1.12, cuda 11.3 +[default0]:**** Git info for Megatron: git_hash=6c1018f git_branch=mtf-multival **** +[default0]:> initializing torch distributed ... +[default0]:[2022-10-06 14:26:39,633] [INFO] [comm.py:628:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl +[default0]:> initializing tensor model parallel with size 2 +[default0]:> initializing pipeline model parallel with size 2 +[default0]:> setting random seeds to 42 ... +[default0]:[2022-10-06 14:26:41,715] [INFO] [checkpointing.py:226:model_parallel_cuda_manual_seed] > initializing model parallel cuda seeds on global rank 0, model parallel rank 0, and data parallel rank 0 with model parallel seed: 2760 and data parallel seed: 42 +[default0]:> compiling dataset index builder ... +[default0]:make: Entering directory '/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/data' +[default0]:make: Nothing to be done for 'default'. +[default0]:make: Leaving directory '/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/data' +[default0]:>>> done with dataset index builder. Compilation time: 0.090 seconds +[default0]:> compiling and loading fused kernels ... +[default0]:Detected CUDA files, patching ldflags +[default0]:Emitting ninja build file /gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/fused_kernels/build/build.ninja... +[default0]:Building extension module scaled_upper_triang_masked_softmax_cuda... +[default0]:Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) +[default0]:ninja: no work to do. +[default0]:Loading extension module scaled_upper_triang_masked_softmax_cuda... +[default0]:Detected CUDA files, patching ldflags +[default0]:Emitting ninja build file /gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/fused_kernels/build/build.ninja... +[default0]:Building extension module scaled_masked_softmax_cuda... +[default0]:Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) +[default0]:ninja: no work to do. +[default0]:Loading extension module scaled_masked_softmax_cuda... +[default0]:Detected CUDA files, patching ldflags +[default0]:Emitting ninja build file /gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/fused_kernels/build/build.ninja... +[default0]:Building extension module fused_mix_prec_layer_norm_cuda... +[default0]:Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) +[default0]:ninja: no work to do. +[default0]:Loading extension module fused_mix_prec_layer_norm_cuda... +[default0]:>>> done with compiling and loading fused kernels. Compilation time: 6.145 seconds +[default0]:time to initialize megatron (seconds): 7.952 +[default0]:[after megatron is initialized] datetime: 2022-10-06 14:26:47 +[default0]:building GPT model ... +[default0]:[2022-10-06 14:26:47,987] [INFO] [utils.py:827:see_memory_usage] Before Building Model +[default0]:[2022-10-06 14:26:47,987] [INFO] [utils.py:828:see_memory_usage] MA 0.0 GB Max_MA 0.0 GB CA 0.0 GB Max_CA 0 GB +[default0]:[2022-10-06 14:26:47,988] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 33.56 GB, percent = 6.7% +[default0]:SEED_LAYERS=False BASE_SEED=1234 SEED_FN=None +[default0]:Using topology: {ProcessCoord(pipe=0, data=0, model=0): 0, ProcessCoord(pipe=0, data=0, model=1): 1, ProcessCoord(pipe=0, data=1, model=0): 2, ProcessCoord(pipe=0, data=1, model=1): 3, ProcessCoord(pipe=0, data=2, model=0): 4, ProcessCoord(pipe=0, data=2, model=1): 5, ProcessCoord(pipe=0, data=3, model=0): 6, ProcessCoord(pipe=0, data=3, model=1): 7, ProcessCoord(pipe=0, data=4, model=0): 8, ProcessCoord(pipe=0, data=4, model=1): 9, ProcessCoord(pipe=0, data=5, model=0): 10, ProcessCoord(pipe=0, data=5, model=1): 11, ProcessCoord(pipe=0, data=6, model=0): 12, ProcessCoord(pipe=0, data=6, model=1): 13, ProcessCoord(pipe=0, data=7, model=0): 14, ProcessCoord(pipe=0, data=7, model=1): 15, ProcessCoord(pipe=0, data=8, model=0): 16, ProcessCoord(pipe=0, data=8, model=1): 17, ProcessCoord(pipe=0, data=9, model=0): 18, ProcessCoord(pipe=0, data=9, model=1): 19, ProcessCoord(pipe=0, data=10, model=0): 20, ProcessCoord(pipe=0, data=10, model=1): 21, ProcessCoord(pipe=0, data=11, model=0): 22, ProcessCoord(pipe=0, data=11, model=1): 23, ProcessCoord(pipe=0, data=12, model=0): 24, ProcessCoord(pipe=0, data=12, model=1): 25, ProcessCoord(pipe=0, data=13, model=0): 26, ProcessCoord(pipe=0, data=13, model=1): 27, ProcessCoord(pipe=0, data=14, model=0): 28, ProcessCoord(pipe=0, data=14, model=1): 29, ProcessCoord(pipe=0, data=15, model=0): 30, ProcessCoord(pipe=0, data=15, model=1): 31, ProcessCoord(pipe=1, data=0, model=0): 32, ProcessCoord(pipe=1, data=0, model=1): 33, ProcessCoord(pipe=1, data=1, model=0): 34, ProcessCoord(pipe=1, data=1, model=1): 35, ProcessCoord(pipe=1, data=2, model=0): 36, ProcessCoord(pipe=1, data=2, model=1): 37, ProcessCoord(pipe=1, data=3, model=0): 38, ProcessCoord(pipe=1, data=3, model=1): 39, ProcessCoord(pipe=1, data=4, model=0): 40, ProcessCoord(pipe=1, data=4, model=1): 41, ProcessCoord(pipe=1, data=5, model=0): 42, ProcessCoord(pipe=1, data=5, model=1): 43, ProcessCoord(pipe=1, data=6, model=0): 44, ProcessCoord(pipe=1, data=6, model=1): 45, ProcessCoord(pipe=1, data=7, model=0): 46, ProcessCoord(pipe=1, data=7, model=1): 47, ProcessCoord(pipe=1, data=8, model=0): 48, ProcessCoord(pipe=1, data=8, model=1): 49, ProcessCoord(pipe=1, data=9, model=0): 50, ProcessCoord(pipe=1, data=9, model=1): 51, ProcessCoord(pipe=1, data=10, model=0): 52, ProcessCoord(pipe=1, data=10, model=1): 53, ProcessCoord(pipe=1, data=11, model=0): 54, ProcessCoord(pipe=1, data=11, model=1): 55, ProcessCoord(pipe=1, data=12, model=0): 56, ProcessCoord(pipe=1, data=12, model=1): 57, ProcessCoord(pipe=1, data=13, model=0): 58, ProcessCoord(pipe=1, data=13, model=1): 59, ProcessCoord(pipe=1, data=14, model=0): 60, ProcessCoord(pipe=1, data=14, model=1): 61, ProcessCoord(pipe=1, data=15, model=0): 62, ProcessCoord(pipe=1, data=15, model=1): 63} +[default0]:[2022-10-06 14:26:48,853] [INFO] [module.py:366:_partition_layers] Partitioning pipeline stages with method type:transformer|embedding +[default0]:stage=0 layers=15 +[default0]: 0: _to_float16 +[default0]: 1: EmbeddingPipe +[default0]: 2: +[default0]: 3: ParallelTransformerLayerPipe +[default0]: 4: ParallelTransformerLayerPipe +[default0]: 5: ParallelTransformerLayerPipe +[default0]: 6: ParallelTransformerLayerPipe +[default0]: 7: ParallelTransformerLayerPipe +[default0]: 8: ParallelTransformerLayerPipe +[default0]: 9: ParallelTransformerLayerPipe +[default0]: 10: ParallelTransformerLayerPipe +[default0]: 11: ParallelTransformerLayerPipe +[default0]: 12: ParallelTransformerLayerPipe +[default0]: 13: ParallelTransformerLayerPipe +[default0]: 14: ParallelTransformerLayerPipe +[default0]:stage=1 layers=16 +[default0]: 15: ParallelTransformerLayerPipe +[default0]: 16: ParallelTransformerLayerPipe +[default0]: 17: ParallelTransformerLayerPipe +[default0]: 18: ParallelTransformerLayerPipe +[default0]: 19: ParallelTransformerLayerPipe +[default0]: 20: ParallelTransformerLayerPipe +[default0]: 21: ParallelTransformerLayerPipe +[default0]: 22: ParallelTransformerLayerPipe +[default0]: 23: ParallelTransformerLayerPipe +[default0]: 24: ParallelTransformerLayerPipe +[default0]: 25: ParallelTransformerLayerPipe +[default0]: 26: ParallelTransformerLayerPipe +[default0]: 27: undo +[default0]: 28: MixedFusedLayerNorm +[default0]: 29: EmbeddingPipe +[default0]: 30: float16_to_fp32 +[default0]: loss: CrossEntropy +[default0]:[2022-10-06 14:26:49,650] [INFO] [utils.py:827:see_memory_usage] After Building Model +[default0]:[2022-10-06 14:26:49,651] [INFO] [utils.py:828:see_memory_usage] MA 1.04 GB Max_MA 1.04 GB CA 1.04 GB Max_CA 1 GB +[default0]:[2022-10-06 14:26:49,651] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 33.99 GB, percent = 6.8% +[default0]:setting training iterations to 3100 +[default0]:> learning rate decay style: constant +[default0]:DeepSpeed is enabled. +[default0]:[2022-10-06 14:26:49,652] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed info: version=0.7.1+8b2a6371, git-hash=8b2a6371, git-branch=master +[default4]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default4]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default2]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default6]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default0]:[2022-10-06 14:26:50,715] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed Flops Profiler Enabled: False +[default0]:[2022-10-06 14:26:50,715] [INFO] [logging.py:68:log_dist] [Rank 0] Removing param_group that has no 'params' in the client Optimizer +[default0]:[2022-10-06 14:26:50,715] [INFO] [logging.py:68:log_dist] [Rank 0] Using client Optimizer as basic optimizer +[default0]:[2022-10-06 14:26:50,718] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed Basic Optimizer = {basic_optimizer.__class__.__name__} +[default0]:[2022-10-06 14:26:50,718] [INFO] [utils.py:52:is_zero_supported_optimizer] Checking ZeRO support for optimizer=FusedAdam type= +[default0]:[2022-10-06 14:26:50,718] [INFO] [logging.py:68:log_dist] [Rank 0] Creating fp16 ZeRO stage 1 optimizer +[default0]:[2022-10-06 14:26:50,718] [INFO] [stage_1_and_2.py:134:__init__] Reduce bucket size 500000000 +[default0]:[2022-10-06 14:26:50,718] [INFO] [stage_1_and_2.py:135:__init__] Allgather bucket size 500000000 +[default0]:[2022-10-06 14:26:50,718] [INFO] [stage_1_and_2.py:136:__init__] CPU Offload: False +[default0]:[2022-10-06 14:26:50,718] [INFO] [stage_1_and_2.py:137:__init__] Round robin gradient partitioning: False +[default0]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default0]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default6]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default3]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default4]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default6]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default2]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default7]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default0]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default1]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default5]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default7]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default0]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default2]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default6]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default1]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default3]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default2]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default4]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default6]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default3]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default5]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default1]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default0]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default7]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default5]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default4]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default1]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default6]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default7]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default4]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default5]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default3]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default2]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default0]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default6]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default3]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default7]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default5]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default0]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default4]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default1]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default2]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default2]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default5]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default0]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default1]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default7]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default2]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default3]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default5]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default4]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default6]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default7]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default1]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default3]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default3]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default7]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default1]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default5]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default0]:Emitting ninja build file /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113/utils/build.ninja... +[default0]:Building extension module utils... +[default0]:Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) +[default3]:Loading extension module utils... +[default6]:Loading extension module utils... +[default2]:Loading extension module utils... +[default4]:Loading extension module utils... +[default7]:Loading extension module utils... +[default1]:Loading extension module utils... +[default5]:Loading extension module utils... +[default0]:Loading extension module utils... +[default6]:Loading extension module utils... +[default7]:Loading extension module utils... +[default0]:Loading extension module utils... +[default2]:Loading extension module utils... +[default3]:Loading extension module utils... +[default1]:Loading extension module utils... +[default4]:Loading extension module utils... +[default2]:Loading extension module utils... +[default6]:Loading extension module utils... +[default1]:Loading extension module utils... +[default5]:Loading extension module utils... +[default3]:Loading extension module utils... +[default0]:Loading extension module utils... +[default7]:Loading extension module utils... +[default5]:Loading extension module utils... +[default4]:Loading extension module utils... +[default7]:Loading extension module utils... +[default7]:Time to load utils op: 0.36980557441711426 seconds +[default6]:Loading extension module utils... +[default6]:Time to load utils op: 0.44371747970581055 seconds +[default5]:Loading extension module utils... +[default5]:Time to load utils op: 0.36938023567199707 seconds +[default4]:Loading extension module utils... +[default4]:Time to load utils op: 0.44347691535949707 seconds +[default2]:Loading extension module utils... +[default2]:Time to load utils op: 0.4465501308441162 seconds +[default0]:ninja: no work to do. +[default3]:Loading extension module utils... +[default3]:Time to load utils op: 0.3703181743621826 seconds +[default0]:Loading extension module utils... +[default1]:Loading extension module utils... +[default1]:Time to load utils op: 0.3709139823913574 seconds +[default0]:Time to load utils op: 0.44083642959594727 seconds +[default6]:Loading extension module utils... +[default6]:Time to load utils op: 0.5364015102386475 seconds +[default1]:Loading extension module utils... +[default1]:Time to load utils op: 0.4299333095550537 seconds +[default0]:Loading extension module utils... +[default0]:Time to load utils op: 0.5363924503326416 seconds +[default7]:Loading extension module utils... +[default7]:Time to load utils op: 0.43015551567077637 seconds +[default3]:Loading extension module utils... +[default2]:Loading extension module utils... +[default2]:Time to load utils op: 0.5363919734954834 seconds +[default3]:Time to load utils op: 0.42998337745666504 seconds +[default4]:Loading extension module utils... +[default4]:Time to load utils op: 0.536431074142456 seconds +[default5]:Loading extension module utils... +[default5]:Time to load utils op: 0.4302544593811035 seconds +[default2]:Loading extension module utils... +[default2]:Time to load utils op: 0.493013858795166 seconds +[default1]:Loading extension module utils... +[default1]:Time to load utils op: 0.4266223907470703 seconds +[default5]:Loading extension module utils... +[default5]:Time to load utils op: 0.4144868850708008 seconds +[default0]:Loading extension module utils... +[default0]:Time to load utils op: 0.5053396224975586 seconds +[default7]:Loading extension module utils... +[default7]:Time to load utils op: 0.42500782012939453 seconds +[default2]:Loading extension module utils... +[default2]:Time to load utils op: 0.5047900676727295 seconds +[default5]:Loading extension module utils... +[default5]:Time to load utils op: 0.42508506774902344 seconds +[default3]:Loading extension module utils... +[default3]:Time to load utils op: 0.41474103927612305 seconds +[default4]:Loading extension module utils... +[default4]:Time to load utils op: 0.5027425289154053 seconds +[default6]:Loading extension module utils... +[default6]:Time to load utils op: 0.5022039413452148 seconds +[default1]:Loading extension module utils... +[default1]:Time to load utils op: 0.41521620750427246 seconds +[default3]:Loading extension module utils... +[default3]:Time to load utils op: 0.41706252098083496 seconds +[default4]:Loading extension module utils... +[default4]:Time to load utils op: 0.49282169342041016 seconds +[default7]:Loading extension module utils... +[default7]:Time to load utils op: 0.41518592834472656 seconds +[default3]:Loading extension module utils... +[default3]:Time to load utils op: 0.429166316986084 seconds +[default4]:Loading extension module utils... +[default4]:Time to load utils op: 0.5432443618774414 seconds +[default6]:Loading extension module utils... +[default6]:Time to load utils op: 0.5412874221801758 seconds +[default2]:Loading extension module utils... +[default2]:Time to load utils op: 0.5441410541534424 seconds +[default1]:Loading extension module utils... +[default1]:Time to load utils op: 0.43930506706237793 seconds +[default0]:Loading extension module utils... +[default0]:Time to load utils op: 0.5437920093536377 seconds +[default7]:Loading extension module utils... +[default7]:Time to load utils op: 0.4363870620727539 seconds +[default5]:Loading extension module utils... +[default5]:Time to load utils op: 0.4371066093444824 seconds +[default0]:Loading extension module utils... +[default0]:Time to load utils op: 0.4930446147918701 seconds +[default6]:Loading extension module utils... +[default6]:Time to load utils op: 0.49344706535339355 seconds +[default3]:Time to load utils op: 0.43055295944213867 seconds +[default6]:Time to load utils op: 0.5370094776153564 seconds +[default2]:Time to load utils op: 0.5370326042175293 seconds +[default4]:Time to load utils op: 0.5370175838470459 seconds +[default7]:Time to load utils op: 0.43120384216308594 seconds +[default1]:Time to load utils op: 0.4312317371368408 seconds +[default6]:Time to load utils op: 0.4877891540527344 seconds +[default7]:Time to load utils op: 0.41689395904541016 seconds +[default3]:Time to load utils op: 0.4192993640899658 seconds +[default0]:Time to load utils op: 0.48778295516967773 seconds +[default4]:Time to load utils op: 0.5369880199432373 seconds +[default2]:Time to load utils op: 0.48776721954345703 seconds +[default1]:Time to load utils op: 0.4187295436859131 seconds +[default2]:Time to load utils op: 0.5370054244995117 seconds +[default6]:Time to load utils op: 0.5370042324066162 seconds +[default1]:Time to load utils op: 0.4345879554748535 seconds +[default5]:Time to load utils op: 0.4320807456970215 seconds +[default3]:Time to load utils op: 0.4355349540710449 seconds +[default0]:Time to load utils op: 0.5369992256164551 seconds +[default5]:Time to load utils op: 0.43182992935180664 seconds +[default0]:Time to load utils op: 0.5370512008666992 seconds +[default7]:Time to load utils op: 0.4327216148376465 seconds +[default5]:Time to load utils op: 0.4165937900543213 seconds +[default4]:Time to load utils op: 0.48778438568115234 seconds +[default0]:Rank: 40 partition count [16, 16, 16] and sizes[(24969216, False), (9961472, False), (14848, False)] +[default1]:Rank: 41 partition count [16, 16, 16] and sizes[(24969216, False), (9961472, False), (14848, False)] +[default0]:Rank: 8 partition count [16, 16, 16] and sizes[(24969216, False), (9961472, False), (14848, False)] +[default2]:Rank: 10 partition count [16, 16, 16] and sizes[(24969216, False), (9961472, False), (14848, False)] +[default3]:Rank: 11 partition count [16, 16, 16] and sizes[(24969216, False), (9961472, False), (14848, False)] +[default1]:Rank: 9 partition count [16, 16, 16] and sizes[(24969216, False), (9961472, False), (14848, False)] +[default3]:Rank: 59 partition count [16, 16, 16] and sizes[(24969216, False), (9961472, False), (14848, False)] +[default6]:Rank: 22 partition count [16, 16, 16] and sizes[(24969216, False), (9961472, False), (14848, False)] +[default7]:Rank: 47 partition count [16, 16, 16] and sizes[(24969216, False), (9961472, False), (14848, False)] +[default6]:Rank: 46 partition count [16, 16, 16] and sizes[(24969216, False), (9961472, False), (14848, False)] +[default2]:Rank: 58 partition count [16, 16, 16] and sizes[(24969216, False), (9961472, False), (14848, False)] +[default3]:Rank: 3 partition count [16, 16, 16] and sizes[(24969216, False), (9961472, False), (14848, False)] +[default7]:Rank: 23 partition count [16, 16, 16] and sizes[(24969216, False), (9961472, False), (14848, False)] +[default1]:Rank: 49 partition count [16, 16, 16] and sizes[(24969216, False), (9961472, False), (14848, False)] +[default1]:Rank: 25 partition count [16, 16, 16] and sizes[(24969216, False), (9961472, False), (14848, False)] +[default0]:Rank: 24 partition count [16, 16, 16] and sizes[(24969216, False), (9961472, False), (14848, False)] +[default2]:Rank: 26 partition count [16, 16, 16] and sizes[(24969216, False), (9961472, False), (14848, False)] +[default1]:Rank: 33 partition count [16, 16, 16] and sizes[(24969216, False), (9961472, False), (14848, False)] +[default2]:Rank: 2 partition count [16, 16, 16] and sizes[(24969216, False), (9961472, False), (14848, False)] +[default6]:Rank: 62 partition count [16, 16, 16] and sizes[(24969216, False), (9961472, False), (14848, False)] +[default0]:Rank: 48 partition count [16, 16, 16] and sizes[(24969216, False), (9961472, False), (14848, False)] +[default0]:Rank: 32 partition count [16, 16, 16] and sizes[(24969216, False), (9961472, False), (14848, False)] +[default5]:Rank: 61 partition count [16, 16, 16] and sizes[(24969216, False), (9961472, False), (14848, False)] +[default7]:Rank: 63 partition count [16, 16, 16] and sizes[(24969216, False), (9961472, False), (14848, False)] +[default4]:Rank: 60 partition count [16, 16, 16] and sizes[(24969216, False), (9961472, False), (14848, False)] +[default6]:Rank: 6 partition count [16, 16, 16] and sizes[(24969216, False), (9961472, False), (14848, False)] +[default1]:Rank: 1 partition count [16, 16, 16] and sizes[(24969216, False), (9961472, False), (14848, False)] +[default0]:Rank: 56 partition count [16, 16, 16] and sizes[(24969216, False), (9961472, False), (14848, False)] +[default3]:Rank: 51 partition count [16, 16, 16] and sizes[(24969216, False), (9961472, False), (14848, False)] +[default6]:Rank: 30 partition count [16, 16, 16] and sizes[(24969216, False), (9961472, False), (14848, False)] +[default7]:Rank: 31 partition count [16, 16, 16] and sizes[(24969216, False), (9961472, False), (14848, False)] +[default3]:Rank: 27 partition count [16, 16, 16] and sizes[(24969216, False), (9961472, False), (14848, False)] +[default1]:Rank: 57 partition count [16, 16, 16] and sizes[(24969216, False), (9961472, False), (14848, False)] +[default4]:Rank: 4 partition count [16, 16, 16] and sizes[(24969216, False), (9961472, False), (14848, False)] +[default7]:Rank: 7 partition count [16, 16, 16] and sizes[(24969216, False), (9961472, False), (14848, False)] +[default0]:Rank: 0 partition count [16, 16, 16] and sizes[(24969216, False), (9961472, False), (14848, False)] +[default5]:Rank: 5 partition count [16, 16, 16] and sizes[(24969216, False), (9961472, False), (14848, False)] +[default6]:Rank: 54 partition count [16, 16, 16] and sizes[(24969216, False), (9961472, False), (14848, False)] +[default7]:Rank: 55 partition count [16, 16, 16] and sizes[(24969216, False), (9961472, False), (14848, False)] +[default2]:Rank: 50 partition count [16, 16, 16] and sizes[(24969216, False), (9961472, False), (14848, False)] +[default5]:Rank: 29 partition count [16, 16, 16] and sizes[(24969216, False), (9961472, False), (14848, False)] +[default4]:Rank: 28 partition count [16, 16, 16] and sizes[(24969216, False), (9961472, False), (14848, False)] +[default4]:Rank: 52 partition count [16, 16, 16] and sizes[(24969216, False), (9961472, False), (14848, False)] +[default5]:Rank: 53 partition count [16, 16, 16] and sizes[(24969216, False), (9961472, False), (14848, False)] +[default7]:Rank: 39 partition count [16, 16, 16] and sizes[(24969216, False), (9961472, False), (14848, False)] +[default2]:Rank: 34 partition count [16, 16, 16] and sizes[(24969216, False), (9961472, False), (14848, False)] +[default5]:Rank: 37 partition count [16, 16, 16] and sizes[(24969216, False), (9961472, False), (14848, False)] +[default3]:Rank: 35 partition count [16, 16, 16] and sizes[(24969216, False), (9961472, False), (14848, False)] +[default6]:Rank: 38 partition count [16, 16, 16] and sizes[(24969216, False), (9961472, False), (14848, False)] +[default4]:Rank: 36 partition count [16, 16, 16] and sizes[(24969216, False), (9961472, False), (14848, False)] +[default1]:Rank: 17 partition count [16, 16, 16] and sizes[(24969216, False), (9961472, False), (14848, False)] +[default4]:Rank: 20 partition count [16, 16, 16] and sizes[(24969216, False), (9961472, False), (14848, False)] +[default0]:Rank: 16 partition count [16, 16, 16] and sizes[(24969216, False), (9961472, False), (14848, False)] +[default5]:Rank: 21 partition count [16, 16, 16] and sizes[(24969216, False), (9961472, False), (14848, False)] +[default4]:Rank: 12 partition count [16, 16, 16] and sizes[(24969216, False), (9961472, False), (14848, False)] +[default5]:Rank: 13 partition count [16, 16, 16] and sizes[(24969216, False), (9961472, False), (14848, False)] +[default5]:Rank: 45 partition count [16, 16, 16] and sizes[(24969216, False), (9961472, False), (14848, False)] +[default4]:Rank: 44 partition count [16, 16, 16] and sizes[(24969216, False), (9961472, False), (14848, False)] +[default3]:Rank: 19 partition count [16, 16, 16] and sizes[(24969216, False), (9961472, False), (14848, False)] +[default2]:Rank: 18 partition count [16, 16, 16] and sizes[(24969216, False), (9961472, False), (14848, False)] +[default2]:Rank: 42 partition count [16, 16, 16] and sizes[(24969216, False), (9961472, False), (14848, False)] +[default3]:Rank: 43 partition count [16, 16, 16] and sizes[(24969216, False), (9961472, False), (14848, False)] +[default6]:Rank: 14 partition count [16, 16, 16] and sizes[(24969216, False), (9961472, False), (14848, False)] +[default7]:Rank: 15 partition count [16, 16, 16] and sizes[(24969216, False), (9961472, False), (14848, False)] +[default7]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default7]:No modifications detected for re-loaded extension module utils, skipping build step... +[default7]:Loading extension module utils... +[default7]:Time to load utils op: 0.0021181106567382812 seconds +[default6]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default6]:No modifications detected for re-loaded extension module utils, skipping build step... +[default6]:Loading extension module utils... +[default6]:Time to load utils op: 0.002348184585571289 seconds +[default5]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default5]:No modifications detected for re-loaded extension module utils, skipping build step... +[default5]:Loading extension module utils... +[default5]:Time to load utils op: 0.0022194385528564453 seconds +[default6]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default6]:No modifications detected for re-loaded extension module utils, skipping build step... +[default6]:Loading extension module utils... +[default6]:Time to load utils op: 0.002135753631591797 seconds +[default7]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default1]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default1]:No modifications detected for re-loaded extension module utils, skipping build step... +[default7]:No modifications detected for re-loaded extension module utils, skipping build step... +[default7]:Loading extension module utils... +[default7]:Time to load utils op: 0.0018515586853027344 seconds +[default1]:Loading extension module utils... +[default1]:Time to load utils op: 0.001575469970703125 seconds +[default0]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default0]:No modifications detected for re-loaded extension module utils, skipping build step... +[default0]:Loading extension module utils... +[default0]:Time to load utils op: 0.0016431808471679688 seconds +[default3]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default3]:No modifications detected for re-loaded extension module utils, skipping build step... +[default3]:Loading extension module utils... +[default3]:Time to load utils op: 0.001573801040649414 seconds +[default5]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default5]:No modifications detected for re-loaded extension module utils, skipping build step... +[default5]:Loading extension module utils... +[default5]:Time to load utils op: 0.00353240966796875 seconds +[default4]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default4]:No modifications detected for re-loaded extension module utils, skipping build step... +[default4]:Loading extension module utils... +[default4]:Time to load utils op: 0.0020906925201416016 seconds +[default2]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default2]:No modifications detected for re-loaded extension module utils, skipping build step... +[default2]:Loading extension module utils... +[default2]:Time to load utils op: 0.0016264915466308594 seconds +[default4]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default4]:No modifications detected for re-loaded extension module utils, skipping build step... +[default2]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default2]:No modifications detected for re-loaded extension module utils, skipping build step... +[default2]:Loading extension module utils... +[default2]:Time to load utils op: 0.003627300262451172 seconds +[default4]:Loading extension module utils... +[default4]:Time to load utils op: 0.0035867691040039062 seconds +[default0]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default1]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default1]:No modifications detected for re-loaded extension module utils, skipping build step... +[default1]:Loading extension module utils... +[default0]:No modifications detected for re-loaded extension module utils, skipping build step... +[default0]:Loading extension module utils... +[default0]:Time to load utils op: 0.0018055438995361328 seconds +[default1]:Time to load utils op: 0.002003908157348633 seconds +[default3]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default3]:No modifications detected for re-loaded extension module utils, skipping build step... +[default3]:Loading extension module utils... +[default3]:Time to load utils op: 0.002479076385498047 seconds +[default2]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default2]:No modifications detected for re-loaded extension module utils, skipping build step... +[default2]:Loading extension module utils... +[default2]:Time to load utils op: 0.0019466876983642578 seconds +[default5]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default5]:No modifications detected for re-loaded extension module utils, skipping build step... +[default5]:Loading extension module utils... +[default5]:Time to load utils op: 0.0030975341796875 seconds +[default0]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default0]:No modifications detected for re-loaded extension module utils, skipping build step... +[default0]:Loading extension module utils... +[default0]:Time to load utils op: 0.0009720325469970703 seconds +[default1]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default1]:No modifications detected for re-loaded extension module utils, skipping build step... +[default1]:Loading extension module utils... +[default1]:Time to load utils op: 0.0011303424835205078 seconds +[default7]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default7]:No modifications detected for re-loaded extension module utils, skipping build step... +[default7]:Loading extension module utils... +[default7]:Time to load utils op: 0.0012180805206298828 seconds +[default2]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default2]:No modifications detected for re-loaded extension module utils, skipping build step... +[default2]:Loading extension module utils... +[default2]:Time to load utils op: 0.002145051956176758 seconds +[default5]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default5]:No modifications detected for re-loaded extension module utils, skipping build step... +[default5]:Loading extension module utils... +[default5]:Time to load utils op: 0.0014214515686035156 seconds +[default6]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default6]:No modifications detected for re-loaded extension module utils, skipping build step... +[default6]:Loading extension module utils... +[default6]:Time to load utils op: 0.001802682876586914 seconds +[default3]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default3]:No modifications detected for re-loaded extension module utils, skipping build step... +[default3]:Loading extension module utils... +[default3]:Time to load utils op: 0.0010783672332763672 seconds +[default4]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default4]:No modifications detected for re-loaded extension module utils, skipping build step... +[default4]:Loading extension module utils... +[default4]:Time to load utils op: 0.0012106895446777344 seconds +[default4]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default7]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default7]:No modifications detected for re-loaded extension module utils, skipping build step... +[default7]:Loading extension module utils... +[default7]:Time to load utils op: 0.0013093948364257812 seconds +[default1]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default4]:No modifications detected for re-loaded extension module utils, skipping build step... +[default4]:Loading extension module utils... +[default1]:No modifications detected for re-loaded extension module utils, skipping build step... +[default1]:Loading extension module utils... +[default1]:Time to load utils op: 0.001451730728149414 seconds +[default3]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default3]:No modifications detected for re-loaded extension module utils, skipping build step... +[default3]:Loading extension module utils... +[default4]:Time to load utils op: 0.002294778823852539 seconds +[default3]:Time to load utils op: 0.0015299320220947266 seconds +[default3]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default3]:No modifications detected for re-loaded extension module utils, skipping build step... +[default3]:Loading extension module utils... +[default3]:Time to load utils op: 0.0036971569061279297 seconds +[default0]:[2022-10-06 14:26:55,592] [INFO] [utils.py:827:see_memory_usage] Before initializing optimizer states +[default0]:[2022-10-06 14:26:55,592] [INFO] [utils.py:828:see_memory_usage] MA 1.17 GB Max_MA 1.19 GB CA 1.79 GB Max_CA 2 GB +[default0]:[2022-10-06 14:26:55,592] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 37.8 GB, percent = 7.5% +[default6]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default6]:No modifications detected for re-loaded extension module utils, skipping build step... +[default6]:Loading extension module utils... +[default6]:Time to load utils op: 0.0023207664489746094 seconds +[default2]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default2]:No modifications detected for re-loaded extension module utils, skipping build step... +[default2]:Loading extension module utils... +[default2]:Time to load utils op: 0.002238750457763672 seconds +[default1]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default1]:No modifications detected for re-loaded extension module utils, skipping build step... +[default1]:Loading extension module utils... +[default1]:Time to load utils op: 0.0022127628326416016 seconds +[default4]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default4]:No modifications detected for re-loaded extension module utils, skipping build step... +[default4]:Loading extension module utils... +[default4]:Time to load utils op: 0.0027277469635009766 seconds +[default5]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default5]:No modifications detected for re-loaded extension module utils, skipping build step... +[default5]:Loading extension module utils... +[default5]:Time to load utils op: 0.002840757369995117 seconds +[default7]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default7]:No modifications detected for re-loaded extension module utils, skipping build step... +[default7]:Loading extension module utils... +[default7]:Time to load utils op: 0.00213623046875 seconds +[default0]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default0]:No modifications detected for re-loaded extension module utils, skipping build step... +[default0]:Loading extension module utils... +[default0]:Time to load utils op: 0.0012481212615966797 seconds +[default6]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default6]:No modifications detected for re-loaded extension module utils, skipping build step... +[default6]:Loading extension module utils... +[default6]:Time to load utils op: 0.0015149116516113281 seconds +[default6]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default6]:No modifications detected for re-loaded extension module utils, skipping build step... +[default6]:Loading extension module utils... +[default6]:Time to load utils op: 0.0011260509490966797 seconds +[default1]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default1]:No modifications detected for re-loaded extension module utils, skipping build step... +[default1]:Loading extension module utils... +[default1]:Time to load utils op: 0.0007774829864501953 seconds +[default2]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default0]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default0]:No modifications detected for re-loaded extension module utils, skipping build step... +[default0]:Loading extension module utils... +[default0]:Time to load utils op: 0.0035042762756347656 seconds +[default4]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default4]:No modifications detected for re-loaded extension module utils, skipping build step... +[default4]:Loading extension module utils... +[default2]:No modifications detected for re-loaded extension module utils, skipping build step... +[default6]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default2]:Loading extension module utils... +[default2]:Time to load utils op: 0.0007190704345703125 seconds +[default4]:Time to load utils op: 0.0013928413391113281 seconds +[default6]:No modifications detected for re-loaded extension module utils, skipping build step... +[default6]:Loading extension module utils... +[default6]:Time to load utils op: 0.0023686885833740234 seconds +[default5]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default0]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default0]:No modifications detected for re-loaded extension module utils, skipping build step... +[default0]:Loading extension module utils... +[default0]:Time to load utils op: 0.0008339881896972656 seconds +[default3]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default3]:No modifications detected for re-loaded extension module utils, skipping build step... +[default3]:Loading extension module utils... +[default3]:Time to load utils op: 0.0017595291137695312 seconds +[default3]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default3]:No modifications detected for re-loaded extension module utils, skipping build step... +[default3]:Loading extension module utils... +[default3]:Time to load utils op: 0.0008683204650878906 seconds +[default7]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default7]:No modifications detected for re-loaded extension module utils, skipping build step... +[default7]:Loading extension module utils... +[default7]:Time to load utils op: 0.000598907470703125 seconds +[default5]:No modifications detected for re-loaded extension module utils, skipping build step... +[default5]:Loading extension module utils... +[default5]:Time to load utils op: 0.0007197856903076172 seconds +[default2]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default2]:No modifications detected for re-loaded extension module utils, skipping build step... +[default2]:Loading extension module utils... +[default2]:Time to load utils op: 0.0016880035400390625 seconds +[default1]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default1]:No modifications detected for re-loaded extension module utils, skipping build step... +[default1]:Loading extension module utils... +[default1]:Time to load utils op: 0.0015802383422851562 seconds +[default7]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default7]:No modifications detected for re-loaded extension module utils, skipping build step... +[default7]:Loading extension module utils... +[default7]:Time to load utils op: 0.0021114349365234375 seconds +[default4]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default4]:No modifications detected for re-loaded extension module utils, skipping build step... +[default4]:Loading extension module utils... +[default1]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default1]:No modifications detected for re-loaded extension module utils, skipping build step... +[default2]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default2]:No modifications detected for re-loaded extension module utils, skipping build step... +[default2]:Loading extension module utils... +[default2]:Time to load utils op: 0.0015385150909423828 seconds +[default1]:Loading extension module utils... +[default1]:Time to load utils op: 0.0018625259399414062 seconds +[default4]:Time to load utils op: 0.001689910888671875 seconds +[default3]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default3]:No modifications detected for re-loaded extension module utils, skipping build step... +[default3]:Loading extension module utils... +[default3]:Time to load utils op: 0.001520395278930664 seconds +[default5]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default5]:No modifications detected for re-loaded extension module utils, skipping build step... +[default5]:Loading extension module utils... +[default6]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default6]:No modifications detected for re-loaded extension module utils, skipping build step... +[default6]:Loading extension module utils... +[default5]:Time to load utils op: 0.0019941329956054688 seconds +[default6]:Time to load utils op: 0.0029938220977783203 seconds +[default7]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default7]:No modifications detected for re-loaded extension module utils, skipping build step... +[default7]:Loading extension module utils... +[default7]:Time to load utils op: 0.0032324790954589844 seconds +[default0]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default0]:No modifications detected for re-loaded extension module utils, skipping build step... +[default0]:Loading extension module utils... +[default0]:Time to load utils op: 0.0015730857849121094 seconds +[default4]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default4]:No modifications detected for re-loaded extension module utils, skipping build step... +[default4]:Loading extension module utils... +[default4]:Time to load utils op: 0.0016865730285644531 seconds +[default5]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default5]:No modifications detected for re-loaded extension module utils, skipping build step... +[default5]:Loading extension module utils... +[default5]:Time to load utils op: 0.001672983169555664 seconds +[default0]:[2022-10-06 14:26:55,660] [INFO] [utils.py:827:see_memory_usage] After initializing optimizer states +[default0]:[2022-10-06 14:26:55,660] [INFO] [utils.py:828:see_memory_usage] MA 1.43 GB Max_MA 1.56 GB CA 2.14 GB Max_CA 2 GB +[default0]:[2022-10-06 14:26:55,660] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 37.8 GB, percent = 7.5% +[default0]:[2022-10-06 14:26:55,660] [INFO] [stage_1_and_2.py:516:__init__] optimizer state initialized +[default0]:[2022-10-06 14:26:55,681] [INFO] [utils.py:827:see_memory_usage] After initializing ZeRO optimizer +[default0]:[2022-10-06 14:26:55,682] [INFO] [utils.py:828:see_memory_usage] MA 1.43 GB Max_MA 1.43 GB CA 2.14 GB Max_CA 2 GB +[default0]:[2022-10-06 14:26:55,682] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 37.8 GB, percent = 7.5% +[default0]:[2022-10-06 14:26:55,682] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed Final Optimizer = FusedAdam +[default0]:[2022-10-06 14:26:55,682] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed using client LR scheduler +[default0]:[2022-10-06 14:26:55,682] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed LR Scheduler = +[default0]:[2022-10-06 14:26:55,682] [INFO] [logging.py:68:log_dist] [Rank 0] step=0, skipped=0, lr=[2e-05, 2e-05, 2e-05], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)] +[default0]:[2022-10-06 14:26:55,682] [INFO] [config.py:987:print] DeepSpeedEngine configuration: +[default0]:[2022-10-06 14:26:55,683] [INFO] [config.py:991:print] activation_checkpointing_config { +[default0]: "partition_activations": false, +[default0]: "contiguous_memory_optimization": false, +[default0]: "cpu_checkpointing": false, +[default0]: "number_checkpoints": null, +[default0]: "synchronize_checkpoint_boundary": false, +[default0]: "profile": false +[default0]:} +[default0]:[2022-10-06 14:26:55,683] [INFO] [config.py:991:print] aio_config ................... {'block_size': 1048576, 'queue_depth': 8, 'thread_count': 1, 'single_submit': False, 'overlap_events': True} +[default0]:[2022-10-06 14:26:55,683] [INFO] [config.py:991:print] amp_enabled .................. False +[default0]:[2022-10-06 14:26:55,683] [INFO] [config.py:991:print] amp_params ................... False +[default0]:[2022-10-06 14:26:55,683] [INFO] [config.py:991:print] autotuning_config ............ { +[default0]: "enabled": false, +[default0]: "start_step": null, +[default0]: "end_step": null, +[default0]: "metric_path": null, +[default0]: "arg_mappings": null, +[default0]: "metric": "throughput", +[default0]: "model_info": null, +[default0]: "results_dir": null, +[default0]: "exps_dir": null, +[default0]: "overwrite": true, +[default0]: "fast": true, +[default0]: "start_profile_step": 3, +[default0]: "end_profile_step": 5, +[default0]: "tuner_type": "gridsearch", +[default0]: "tuner_early_stopping": 5, +[default0]: "tuner_num_trials": 50, +[default0]: "model_info_path": null, +[default0]: "mp_size": 1, +[default0]: "max_train_batch_size": null, +[default0]: "min_train_batch_size": 1, +[default0]: "max_train_micro_batch_size_per_gpu": 1.024000e+03, +[default0]: "min_train_micro_batch_size_per_gpu": 1, +[default0]: "num_tuning_micro_batch_sizes": 3 +[default0]:} +[default0]:[2022-10-06 14:26:55,683] [INFO] [config.py:991:print] bfloat16_enabled ............. False +[default0]:[2022-10-06 14:26:55,683] [INFO] [config.py:991:print] checkpoint_tag_validation_enabled True +[default0]:[2022-10-06 14:26:55,683] [INFO] [config.py:991:print] checkpoint_tag_validation_fail False +[default0]:[2022-10-06 14:26:55,683] [INFO] [config.py:991:print] comms_config ................. +[default0]:[2022-10-06 14:26:55,683] [INFO] [config.py:991:print] communication_data_type ...... None +[default0]:[2022-10-06 14:26:55,683] [INFO] [config.py:991:print] compression_config ........... {'weight_quantization': {'shared_parameters': {'enabled': False, 'quantizer_kernel': False, 'schedule_offset': 0, 'quantize_groups': 1, 'quantize_verbose': False, 'quantization_type': 'symmetric', 'quantize_weight_in_forward': False, 'rounding': 'nearest', 'fp16_mixed_quantize': False, 'quantize_change_ratio': 0.001}, 'different_groups': {}}, 'activation_quantization': {'shared_parameters': {'enabled': False, 'quantization_type': 'symmetric', 'range_calibration': 'dynamic', 'schedule_offset': 1000}, 'different_groups': {}}, 'sparse_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'row_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'head_pruning': {'shared_parameters': {'enabled': False, 'method': 'topk', 'schedule_offset': 1000}, 'different_groups': {}}, 'channel_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'layer_reduction': {'enabled': False}} +[default0]:[2022-10-06 14:26:55,683] [INFO] [config.py:991:print] curriculum_enabled ........... False +[default0]:[2022-10-06 14:26:55,683] [INFO] [config.py:991:print] curriculum_params ............ False +[default0]:[2022-10-06 14:26:55,683] [INFO] [config.py:991:print] dataloader_drop_last ......... False +[default0]:[2022-10-06 14:26:55,683] [INFO] [config.py:991:print] disable_allgather ............ False +[default0]:[2022-10-06 14:26:55,683] [INFO] [config.py:991:print] dump_state ................... False +[default0]:[2022-10-06 14:26:55,683] [INFO] [config.py:991:print] dynamic_loss_scale_args ...... {'init_scale': 4096, 'scale_window': 500, 'delayed_shift': 2, 'min_scale': 1} +[default0]:[2022-10-06 14:26:55,683] [INFO] [config.py:991:print] eigenvalue_enabled ........... False +[default0]:[2022-10-06 14:26:55,683] [INFO] [config.py:991:print] eigenvalue_gas_boundary_resolution 1 +[default0]:[2022-10-06 14:26:55,683] [INFO] [config.py:991:print] eigenvalue_layer_name ........ bert.encoder.layer +[default0]:[2022-10-06 14:26:55,683] [INFO] [config.py:991:print] eigenvalue_layer_num ......... 0 +[default0]:[2022-10-06 14:26:55,683] [INFO] [config.py:991:print] eigenvalue_max_iter .......... 100 +[default0]:[2022-10-06 14:26:55,683] [INFO] [config.py:991:print] eigenvalue_stability ......... 1e-06 +[default0]:[2022-10-06 14:26:55,683] [INFO] [config.py:991:print] eigenvalue_tol ............... 0.01 +[default0]:[2022-10-06 14:26:55,683] [INFO] [config.py:991:print] eigenvalue_verbose ........... False +[default0]:[2022-10-06 14:26:55,683] [INFO] [config.py:991:print] elasticity_enabled ........... False +[default0]:[2022-10-06 14:26:55,683] [INFO] [config.py:991:print] flops_profiler_config ........ { +[default0]: "enabled": false, +[default0]: "profile_step": 1, +[default0]: "module_depth": -1, +[default0]: "top_modules": 1, +[default0]: "detailed": true, +[default0]: "output_file": null +[default0]:} +[default0]:[2022-10-06 14:26:55,683] [INFO] [config.py:991:print] fp16_auto_cast ............... False +[default0]:[2022-10-06 14:26:55,683] [INFO] [config.py:991:print] fp16_enabled ................. True +[default0]:[2022-10-06 14:26:55,683] [INFO] [config.py:991:print] fp16_master_weights_and_gradients False +[default0]:[2022-10-06 14:26:55,683] [INFO] [config.py:991:print] global_rank .................. 0 +[default0]:[2022-10-06 14:26:55,683] [INFO] [config.py:991:print] gradient_accumulation_steps .. 128 +[default0]:[2022-10-06 14:26:55,683] [INFO] [config.py:991:print] gradient_clipping ............ 1.0 +[default0]:[2022-10-06 14:26:55,683] [INFO] [config.py:991:print] gradient_predivide_factor .... 1.0 +[default0]:[2022-10-06 14:26:55,683] [INFO] [config.py:991:print] initial_dynamic_scale ........ 4096 +[default0]:[2022-10-06 14:26:55,683] [INFO] [config.py:991:print] load_universal_checkpoint .... False +[default0]:[2022-10-06 14:26:55,683] [INFO] [config.py:991:print] loss_scale ................... 0 +[default0]:[2022-10-06 14:26:55,684] [INFO] [config.py:991:print] memory_breakdown ............. False +[default0]:[2022-10-06 14:26:55,684] [INFO] [config.py:991:print] monitor_config ............... +[default0]:[2022-10-06 14:26:55,684] [INFO] [config.py:991:print] nebula_config ................ { +[default0]: "enabled": false, +[default0]: "persistent_storage_path": null, +[default0]: "persistent_time_interval": 100, +[default0]: "num_of_version_in_retention": 2, +[default0]: "enable_nebula_load": true, +[default0]: "load_path": null +[default0]:} +[default0]:[2022-10-06 14:26:55,684] [INFO] [config.py:991:print] optimizer_legacy_fusion ...... False +[default0]:[2022-10-06 14:26:55,684] [INFO] [config.py:991:print] optimizer_name ............... None +[default0]:[2022-10-06 14:26:55,684] [INFO] [config.py:991:print] optimizer_params ............. None +[default0]:[2022-10-06 14:26:55,684] [INFO] [config.py:991:print] pipeline ..................... {'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0} +[default0]:[2022-10-06 14:26:55,684] [INFO] [config.py:991:print] pld_enabled .................. False +[default0]:[2022-10-06 14:26:55,684] [INFO] [config.py:991:print] pld_params ................... False +[default0]:[2022-10-06 14:26:55,684] [INFO] [config.py:991:print] prescale_gradients ........... False +[default0]:[2022-10-06 14:26:55,684] [INFO] [config.py:991:print] scheduler_name ............... None +[default0]:[2022-10-06 14:26:55,684] [INFO] [config.py:991:print] scheduler_params ............. None +[default0]:[2022-10-06 14:26:55,684] [INFO] [config.py:991:print] sparse_attention ............. None +[default0]:[2022-10-06 14:26:55,684] [INFO] [config.py:991:print] sparse_gradients_enabled ..... False +[default0]:[2022-10-06 14:26:55,684] [INFO] [config.py:991:print] steps_per_print .............. 2000 +[default0]:[2022-10-06 14:26:55,684] [INFO] [config.py:991:print] train_batch_size ............. 2048 +[default0]:[2022-10-06 14:26:55,684] [INFO] [config.py:991:print] train_micro_batch_size_per_gpu 1 +[default0]:[2022-10-06 14:26:55,684] [INFO] [config.py:991:print] wall_clock_breakdown ......... False +[default0]:[2022-10-06 14:26:55,684] [INFO] [config.py:991:print] world_size ................... 16 +[default0]:[2022-10-06 14:26:55,684] [INFO] [config.py:991:print] zero_allow_untested_optimizer False +[default0]:[2022-10-06 14:26:55,684] [INFO] [config.py:991:print] zero_config .................. stage=1 contiguous_gradients=True reduce_scatter=True reduce_bucket_size=500000000 allgather_partitions=True allgather_bucket_size=500000000 overlap_comm=False load_from_fp32_weights=True elastic_checkpoint=False offload_param=None offload_optimizer=None sub_group_size=1000000000 cpu_offload_param=None cpu_offload_use_pin_memory=None cpu_offload=None prefetch_bucket_size=50000000 param_persistence_threshold=100000 model_persistence_threshold=9223372036854775807 max_live_parameters=1000000000 max_reuse_distance=1000000000 gather_16bit_weights_on_model_save=False stage3_gather_fp16_weights_on_model_save=False ignore_unused_parameters=True legacy_stage1=False round_robin_gradients=False +[default0]:[2022-10-06 14:26:55,684] [INFO] [config.py:991:print] zero_enabled ................. True +[default0]:[2022-10-06 14:26:55,684] [INFO] [config.py:991:print] zero_optimization_stage ...... 1 +[default0]:[2022-10-06 14:26:55,684] [INFO] [config.py:976:print_user_config] json = { +[default0]: "train_micro_batch_size_per_gpu": 1, +[default0]: "train_batch_size": 2.048000e+03, +[default0]: "gradient_clipping": 1.0, +[default0]: "zero_optimization": { +[default0]: "stage": 1 +[default0]: }, +[default0]: "fp16": { +[default0]: "enabled": true, +[default0]: "loss_scale": 0, +[default0]: "loss_scale_window": 500, +[default0]: "hysteresis": 2, +[default0]: "min_loss_scale": 1, +[default0]: "initial_scale_power": 12 +[default0]: }, +[default0]: "steps_per_print": 2.000000e+03, +[default0]: "wall_clock_breakdown": false +[default0]:} +[default0]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default0]:No modifications detected for re-loaded extension module utils, skipping build step... +[default0]:Loading extension module utils... +[default0]:Time to load utils op: 0.0003936290740966797 seconds +[default0]:[2022-10-06 14:26:55,685] [INFO] [engine.py:87:__init__] CONFIG: micro_batches=128 micro_batch_size=1 +[default1]:[2022-10-06 14:26:56,165] [INFO] [engine.py:145:__init__] RANK=33 STAGE=1 LAYERS=16 [15, 31) STAGE_PARAMS=559128576 (559.129M) TOTAL_PARAMS=2236514304 (2236.514M) UNIQUE_PARAMS=1722712064 (1722.712M) +[default0]:[2022-10-06 14:26:56,165] [INFO] [engine.py:145:__init__] RANK=32 STAGE=1 LAYERS=16 [15, 31) STAGE_PARAMS=559128576 (559.129M) TOTAL_PARAMS=2236514304 (2236.514M) UNIQUE_PARAMS=1722712064 (1722.712M) +[default1]:[2022-10-06 14:26:56,165] [INFO] [engine.py:145:__init__] RANK=1 STAGE=0 LAYERS=15 [0, 15) STAGE_PARAMS=559128576 (559.129M) TOTAL_PARAMS=2236514304 (2236.514M) UNIQUE_PARAMS=1722712064 (1722.712M) +[default0]:[2022-10-06 14:26:56,165] [INFO] [engine.py:145:__init__] RANK=0 STAGE=0 LAYERS=15 [0, 15) STAGE_PARAMS=559128576 (559.129M) TOTAL_PARAMS=2236514304 (2236.514M) UNIQUE_PARAMS=1722712064 (1722.712M) +[default6]:[2022-10-06 14:26:56,344] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default1]:[2022-10-06 14:26:56,344] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default4]:[2022-10-06 14:26:56,344] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default6]:[2022-10-06 14:26:56,345] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default2]:[2022-10-06 14:26:56,344] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default0]:[2022-10-06 14:26:56,344] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default3]:[2022-10-06 14:26:56,344] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default0]:[2022-10-06 14:26:56,345] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default7]:[2022-10-06 14:26:56,344] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default2]:[2022-10-06 14:26:56,345] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default7]:[2022-10-06 14:26:56,345] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default5]:[2022-10-06 14:26:56,344] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default3]:[2022-10-06 14:26:56,345] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default2]:[2022-10-06 14:26:56,343] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default1]:[2022-10-06 14:26:56,343] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default3]:[2022-10-06 14:26:56,343] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default6]:[2022-10-06 14:26:56,343] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default4]:[2022-10-06 14:26:56,343] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default5]:[2022-10-06 14:26:56,343] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default0]:[2022-10-06 14:26:56,343] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default7]:[2022-10-06 14:26:56,343] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default4]:[2022-10-06 14:26:56,345] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default5]:[2022-10-06 14:26:56,345] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default1]:[2022-10-06 14:26:56,345] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default7]:[2022-10-06 14:26:56,335] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default5]:[2022-10-06 14:26:56,336] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default6]:[2022-10-06 14:26:56,335] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default4]:[2022-10-06 14:26:56,336] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default2]:[2022-10-06 14:26:56,335] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default0]:[2022-10-06 14:26:56,336] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default3]:[2022-10-06 14:26:56,336] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default1]:[2022-10-06 14:26:56,335] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default6]:[2022-10-06 14:26:56,343] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default7]:[2022-10-06 14:26:56,343] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default1]:[2022-10-06 14:26:56,343] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default0]:[2022-10-06 14:26:56,343] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default5]:[2022-10-06 14:26:56,343] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default3]:[2022-10-06 14:26:56,343] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default4]:[2022-10-06 14:26:56,343] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default2]:[2022-10-06 14:26:56,343] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default1]:[2022-10-06 14:26:56,343] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default2]:[2022-10-06 14:26:56,343] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default7]:[2022-10-06 14:26:56,343] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default0]:[2022-10-06 14:26:56,343] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default5]:[2022-10-06 14:26:56,343] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default6]:[2022-10-06 14:26:56,343] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default2]:[2022-10-06 14:26:56,338] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default5]:[2022-10-06 14:26:56,337] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default3]:[2022-10-06 14:26:56,343] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default4]:[2022-10-06 14:26:56,343] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default4]:[2022-10-06 14:26:56,337] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default1]:[2022-10-06 14:26:56,338] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default3]:[2022-10-06 14:26:56,338] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default7]:[2022-10-06 14:26:56,337] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default3]:[2022-10-06 14:26:56,339] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default6]:[2022-10-06 14:26:56,339] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default0]:[2022-10-06 14:26:56,339] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default2]:[2022-10-06 14:26:56,339] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default1]:[2022-10-06 14:26:56,339] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default4]:[2022-10-06 14:26:56,339] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default5]:[2022-10-06 14:26:56,339] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default7]:[2022-10-06 14:26:56,339] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default6]:[2022-10-06 14:26:56,337] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default0]:[2022-10-06 14:26:56,337] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default0]:[2022-10-06 14:26:57,807] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default2]:[2022-10-06 14:26:57,806] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default2]:[2022-10-06 14:26:57,817] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default0]:[2022-10-06 14:26:57,817] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default3]:[2022-10-06 14:26:57,854] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default3]:[2022-10-06 14:26:57,863] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_03_model_states.pt... +[default0]:[2022-10-06 14:26:57,870] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default0]:[2022-10-06 14:26:57,880] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_02_model_states.pt... +[default0]:[2022-10-06 14:26:57,906] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default0]:[2022-10-06 14:26:57,915] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_02_model_states.pt... +[default3]:[2022-10-06 14:26:57,926] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default3]:[2022-10-06 14:26:57,937] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_03_model_states.pt... +[default2]:[2022-10-06 14:26:57,899] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default1]:[2022-10-06 14:26:57,899] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default1]:[2022-10-06 14:26:57,912] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_01_model_states.pt... +[default2]:[2022-10-06 14:26:57,911] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default0]:[2022-10-06 14:26:57,980] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default0]:[2022-10-06 14:26:57,989] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default3]:[2022-10-06 14:26:58,066] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default3]:[2022-10-06 14:26:58,076] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_03_model_states.pt... +[default1]:[2022-10-06 14:26:58,066] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default1]:[2022-10-06 14:26:58,075] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_03_model_states.pt... +[default0]:[2022-10-06 14:26:58,078] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default6]:[2022-10-06 14:26:58,064] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default6]:[2022-10-06 14:26:58,073] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default3]:[2022-10-06 14:26:58,093] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default5]:[2022-10-06 14:26:58,144] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default5]:[2022-10-06 14:26:58,160] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_01_model_states.pt... +[default7]:[2022-10-06 14:26:58,136] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default7]:[2022-10-06 14:26:58,148] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_01_model_states.pt... +[default6]:[2022-10-06 14:26:58,138] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default6]:[2022-10-06 14:26:58,151] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default1]:[2022-10-06 14:26:58,133] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default3]:[2022-10-06 14:26:58,150] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default1]:[2022-10-06 14:26:58,142] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_01_model_states.pt... +[default3]:[2022-10-06 14:26:58,168] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_01_model_states.pt... +[default2]:[2022-10-06 14:26:58,137] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default2]:[2022-10-06 14:26:58,153] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_02_model_states.pt... +[default4]:[2022-10-06 14:26:58,136] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default4]:[2022-10-06 14:26:58,149] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default0]:[2022-10-06 14:26:58,087] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_02_model_states.pt... +[default2]:[2022-10-06 14:26:58,087] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default2]:[2022-10-06 14:26:58,103] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_02_model_states.pt... +[default2]:[2022-10-06 14:26:58,105] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default2]:[2022-10-06 14:26:58,115] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_02_model_states.pt... +[default1]:[2022-10-06 14:26:58,156] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default1]:[2022-10-06 14:26:58,174] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_03_model_states.pt... +[default3]:[2022-10-06 14:26:58,107] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_01_model_states.pt... +[default1]:[2022-10-06 14:26:58,242] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default1]:[2022-10-06 14:26:58,251] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_03_model_states.pt... +[default5]:[2022-10-06 14:26:58,192] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default5]:[2022-10-06 14:26:58,204] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_03_model_states.pt... +[default6]:[2022-10-06 14:26:58,192] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default7]:[2022-10-06 14:26:58,201] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default7]:[2022-10-06 14:26:58,212] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_03_model_states.pt... +[default6]:[2022-10-06 14:26:58,204] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_02_model_states.pt... +[default4]:[2022-10-06 14:26:58,241] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default4]:[2022-10-06 14:26:58,249] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_02_model_states.pt... +[default3]:[2022-10-06 14:26:58,201] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default3]:[2022-10-06 14:26:58,224] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_03_model_states.pt... +[default3]:[2022-10-06 14:26:58,399] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default3]:[2022-10-06 14:26:58,411] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_01_model_states.pt... +[default0]:[2022-10-06 14:26:58,448] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default0]:[2022-10-06 14:26:58,456] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default6]:[2022-10-06 14:26:58,398] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default6]:[2022-10-06 14:26:58,407] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_02_model_states.pt... +[default0]:[2022-10-06 14:26:58,440] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default0]:[2022-10-06 14:26:58,462] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_02_model_states.pt... +[default1]:[2022-10-06 14:26:58,622] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default1]:[2022-10-06 14:26:58,630] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_01_model_states.pt... +[default2]:[2022-10-06 14:26:58,656] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default2]:[2022-10-06 14:26:58,664] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default4]:[2022-10-06 14:26:58,594] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default4]:[2022-10-06 14:26:58,603] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_02_model_states.pt... +[default6]:[2022-10-06 14:26:58,714] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default5]:[2022-10-06 14:26:58,724] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default5]:[2022-10-06 14:26:58,737] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_03_model_states.pt... +[default1]:[2022-10-06 14:26:58,765] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default1]:[2022-10-06 14:26:58,774] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_03_model_states.pt... +[default7]:[2022-10-06 14:26:58,724] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default7]:[2022-10-06 14:26:58,737] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_03_model_states.pt... +[default2]:[2022-10-06 14:26:58,752] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default2]:[2022-10-06 14:26:58,761] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_02_model_states.pt... +[default6]:[2022-10-06 14:26:58,722] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_02_model_states.pt... +[default6]:[2022-10-06 14:26:58,940] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default6]:[2022-10-06 14:26:58,949] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_02_model_states.pt... +[default6]:[2022-10-06 14:26:58,951] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default6]:[2022-10-06 14:26:58,962] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default4]:[2022-10-06 14:26:58,895] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default4]:[2022-10-06 14:26:58,912] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default1]:[2022-10-06 14:26:58,903] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default1]:[2022-10-06 14:26:58,920] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_01_model_states.pt... +[default5]:[2022-10-06 14:26:58,888] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default5]:[2022-10-06 14:26:58,898] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_03_model_states.pt... +[default7]:[2022-10-06 14:26:58,903] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default6]:[2022-10-06 14:26:58,888] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default3]:[2022-10-06 14:26:58,903] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default3]:[2022-10-06 14:26:58,922] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_01_model_states.pt... +[default0]:[2022-10-06 14:26:58,901] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default0]:[2022-10-06 14:26:58,917] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default7]:[2022-10-06 14:26:58,917] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_01_model_states.pt... +[default6]:[2022-10-06 14:26:58,899] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default5]:[2022-10-06 14:26:58,891] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default5]:[2022-10-06 14:26:58,905] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_01_model_states.pt... +[default2]:[2022-10-06 14:26:58,904] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default2]:[2022-10-06 14:26:58,921] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default4]:[2022-10-06 14:26:58,934] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default4]:[2022-10-06 14:26:58,944] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_02_model_states.pt... +[default5]:[2022-10-06 14:26:59,032] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default5]:[2022-10-06 14:26:59,041] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_01_model_states.pt... +[default4]:[2022-10-06 14:26:59,013] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default4]:[2022-10-06 14:26:59,022] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default7]:[2022-10-06 14:26:59,021] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default7]:[2022-10-06 14:26:59,031] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_03_model_states.pt... +[default5]:[2022-10-06 14:26:59,125] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default5]:[2022-10-06 14:26:59,133] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_03_model_states.pt... +[default5]:[2022-10-06 14:26:59,177] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default5]:[2022-10-06 14:26:59,188] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_01_model_states.pt... +[default4]:[2022-10-06 14:26:59,177] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default4]:[2022-10-06 14:26:59,189] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default7]:[2022-10-06 14:26:59,182] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default7]:[2022-10-06 14:26:59,194] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_01_model_states.pt... +[default7]:[2022-10-06 14:26:59,182] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default7]:[2022-10-06 14:26:59,191] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_01_model_states.pt... +[default7]:[2022-10-06 14:26:59,281] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default7]:[2022-10-06 14:26:59,290] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_03_model_states.pt... +[default4]:[2022-10-06 14:26:59,303] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default4]:[2022-10-06 14:26:59,312] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_02_model_states.pt... +[default2]:[2022-10-06 14:26:59,991] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default2]:[2022-10-06 14:26:59,991] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:00,008] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default0]:[2022-10-06 14:27:00,008] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default0]:[2022-10-06 14:27:00,009] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:00,008] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:00,285] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default0]:[2022-10-06 14:27:00,286] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default2]:[2022-10-06 14:27:00,283] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default2]:[2022-10-06 14:27:00,284] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:00,475] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default0]:[2022-10-06 14:27:00,476] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default2]:[2022-10-06 14:27:00,451] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:00,452] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:00,464] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:00,465] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:00,550] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default0]:[2022-10-06 14:27:00,551] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default2]:[2022-10-06 14:27:00,550] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:00,551] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default2]:[2022-10-06 14:27:00,667] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_02_model_states.pt. +[default2]:[2022-10-06 14:27:00,669] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default3]:[2022-10-06 14:27:00,677] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_03_model_states.pt. +[default3]:[2022-10-06 14:27:00,678] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:00,695] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:00,660] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:00,645] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:00,694] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:00,666] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_02_model_states.pt. +[default0]:[2022-10-06 14:27:00,667] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:00,746] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:00,746] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:00,753] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:00,753] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default3]:[2022-10-06 14:27:00,755] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default3]:[2022-10-06 14:27:00,756] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_01-model_states.pt... +[default1]:[2022-10-06 14:27:00,788] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_01_model_states.pt. +[default1]:[2022-10-06 14:27:00,789] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:00,757] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:00,732] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default2]:[2022-10-06 14:27:00,744] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:00,745] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default2]:[2022-10-06 14:27:00,769] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:00,774] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:00,770] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:00,771] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:00,800] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default0]:[2022-10-06 14:27:00,753] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default0]:[2022-10-06 14:27:00,753] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:00,781] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default0]:[2022-10-06 14:27:00,787] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:00,801] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:00,854] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default0]:[2022-10-06 14:27:00,854] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default2]:[2022-10-06 14:27:00,809] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default2]:[2022-10-06 14:27:00,853] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:00,853] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default2]:[2022-10-06 14:27:00,872] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:00,789] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:00,801] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default2]:[2022-10-06 14:27:00,868] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:00,868] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default3]:[2022-10-06 14:27:00,840] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_01-model_states.pt. +[default3]:[2022-10-06 14:27:00,847] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:00,803] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default0]:[2022-10-06 14:27:00,803] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:00,842] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default0]:[2022-10-06 14:27:00,850] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:00,892] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default0]:[2022-10-06 14:27:00,893] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default3]:[2022-10-06 14:27:00,809] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_01_model_states.pt. +[default3]:[2022-10-06 14:27:00,809] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default2]:[2022-10-06 14:27:00,817] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:00,817] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default2]:[2022-10-06 14:27:00,839] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:00,844] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default2]:[2022-10-06 14:27:00,901] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:00,902] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:00,808] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:00,840] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:00,840] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:00,868] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:00,873] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:00,901] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:00,902] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:00,867] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default0]:[2022-10-06 14:27:00,867] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:00,902] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default0]:[2022-10-06 14:27:00,906] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:00,875] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default0]:[2022-10-06 14:27:00,881] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:00,931] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default0]:[2022-10-06 14:27:00,931] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:00,955] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default0]:[2022-10-06 14:27:00,961] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default2]:[2022-10-06 14:27:00,878] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default2]:[2022-10-06 14:27:00,926] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:00,926] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default2]:[2022-10-06 14:27:00,946] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:00,955] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default2]:[2022-10-06 14:27:00,911] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:00,920] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default3]:[2022-10-06 14:27:00,898] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default3]:[2022-10-06 14:27:00,899] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_01-model_states.pt... +[default3]:[2022-10-06 14:27:00,957] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_01-model_states.pt. +[default3]:[2022-10-06 14:27:00,963] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default1]:[2022-10-06 14:27:00,925] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default1]:[2022-10-06 14:27:00,927] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_01-model_states.pt... +[default0]:[2022-10-06 14:27:00,933] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default0]:[2022-10-06 14:27:00,939] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:00,967] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default0]:[2022-10-06 14:27:00,967] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:00,993] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default0]:[2022-10-06 14:27:00,998] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default3]:[2022-10-06 14:27:00,945] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default3]:[2022-10-06 14:27:00,947] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_01-model_states.pt... +[default2]:[2022-10-06 14:27:00,924] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:00,930] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default2]:[2022-10-06 14:27:00,976] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:00,976] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default2]:[2022-10-06 14:27:01,001] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:00,930] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:00,937] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:00,978] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:00,979] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:01,002] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default0]:[2022-10-06 14:27:01,002] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:01,023] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default3]:[2022-10-06 14:27:01,071] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_03_model_states.pt. +[default3]:[2022-10-06 14:27:01,072] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:01,023] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default0]:[2022-10-06 14:27:01,023] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:01,048] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default0]:[2022-10-06 14:27:01,052] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default1]:[2022-10-06 14:27:01,013] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_01_model_states.pt. +[default1]:[2022-10-06 14:27:01,014] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default3]:[2022-10-06 14:27:01,012] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_01_model_states.pt. +[default3]:[2022-10-06 14:27:01,013] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default2]:[2022-10-06 14:27:01,018] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:01,018] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default2]:[2022-10-06 14:27:01,039] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:01,044] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default2]:[2022-10-06 14:27:01,000] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:01,000] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default2]:[2022-10-06 14:27:01,034] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:01,044] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default3]:[2022-10-06 14:27:01,008] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default3]:[2022-10-06 14:27:01,009] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_01-model_states.pt... +[default1]:[2022-10-06 14:27:01,072] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_01-model_states.pt. +[default0]:[2022-10-06 14:27:01,034] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default0]:[2022-10-06 14:27:01,035] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:01,075] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default0]:[2022-10-06 14:27:01,087] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default3]:[2022-10-06 14:27:01,094] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_01-model_states.pt. +[default2]:[2022-10-06 14:27:01,005] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default2]:[2022-10-06 14:27:01,058] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:01,058] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default2]:[2022-10-06 14:27:01,079] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:01,087] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:01,008] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:01,013] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:01,064] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:01,064] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:01,093] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default0]:[2022-10-06 14:27:01,028] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default3]:[2022-10-06 14:27:01,116] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default3]:[2022-10-06 14:27:01,122] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_01-model_states.pt... +[default1]:[2022-10-06 14:27:01,072] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_03_model_states.pt. +[default0]:[2022-10-06 14:27:01,142] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_02_model_states.pt. +[default0]:[2022-10-06 14:27:01,142] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default1]:[2022-10-06 14:27:01,073] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default1]:[2022-10-06 14:27:01,122] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default1]:[2022-10-06 14:27:01,122] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_01-model_states.pt... +[default0]:[2022-10-06 14:27:01,160] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default0]:[2022-10-06 14:27:01,160] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:01,091] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default0]:[2022-10-06 14:27:01,091] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:01,120] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default0]:[2022-10-06 14:27:01,125] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default2]:[2022-10-06 14:27:01,087] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:01,087] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default2]:[2022-10-06 14:27:01,112] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:01,117] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default2]:[2022-10-06 14:27:01,142] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_02_model_states.pt. +[default2]:[2022-10-06 14:27:01,142] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default2]:[2022-10-06 14:27:01,166] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:01,166] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default2]:[2022-10-06 14:27:01,139] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:01,139] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default2]:[2022-10-06 14:27:01,180] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default3]:[2022-10-06 14:27:01,098] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_01-model_states.pt. +[default3]:[2022-10-06 14:27:01,105] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default3]:[2022-10-06 14:27:01,142] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default3]:[2022-10-06 14:27:01,143] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_01-model_states.pt... +[default1]:[2022-10-06 14:27:01,135] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default1]:[2022-10-06 14:27:01,160] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default0]:[2022-10-06 14:27:01,139] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default1]:[2022-10-06 14:27:01,161] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_01-model_states.pt... +[default0]:[2022-10-06 14:27:01,140] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:01,178] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default0]:[2022-10-06 14:27:01,186] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default3]:[2022-10-06 14:27:01,154] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default3]:[2022-10-06 14:27:01,186] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default3]:[2022-10-06 14:27:01,186] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_01-model_states.pt... +[default2]:[2022-10-06 14:27:01,139] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:01,140] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default2]:[2022-10-06 14:27:01,160] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:01,165] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:01,109] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:01,144] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:01,145] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:01,170] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:01,177] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:01,138] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default0]:[2022-10-06 14:27:01,138] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:01,166] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default0]:[2022-10-06 14:27:01,173] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:01,256] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default0]:[2022-10-06 14:27:01,256] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default2]:[2022-10-06 14:27:01,246] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default2]:[2022-10-06 14:27:01,246] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:01,250] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default6]:[2022-10-06 14:27:01,250] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default3]:[2022-10-06 14:27:01,177] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_01-model_states.pt. +[default3]:[2022-10-06 14:27:01,190] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default3]:[2022-10-06 14:27:01,234] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default3]:[2022-10-06 14:27:01,237] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_01-model_states.pt... +[default1]:[2022-10-06 14:27:01,182] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_01-model_states.pt. +[default1]:[2022-10-06 14:27:01,195] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default1]:[2022-10-06 14:27:01,240] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default1]:[2022-10-06 14:27:01,241] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_01-model_states.pt... +[default0]:[2022-10-06 14:27:01,178] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default0]:[2022-10-06 14:27:01,190] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:01,233] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default0]:[2022-10-06 14:27:01,233] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:01,250] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default0]:[2022-10-06 14:27:01,258] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:01,211] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default0]:[2022-10-06 14:27:01,211] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:01,231] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default0]:[2022-10-06 14:27:01,237] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default1]:[2022-10-06 14:27:01,214] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default1]:[2022-10-06 14:27:01,216] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_01-model_states.pt... +[default3]:[2022-10-06 14:27:01,216] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default3]:[2022-10-06 14:27:01,217] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_01-model_states.pt... +[default2]:[2022-10-06 14:27:01,211] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:01,212] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default2]:[2022-10-06 14:27:01,231] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:01,238] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default4]:[2022-10-06 14:27:01,244] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default4]:[2022-10-06 14:27:01,244] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default2]:[2022-10-06 14:27:01,198] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:01,207] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default2]:[2022-10-06 14:27:01,243] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:01,243] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default2]:[2022-10-06 14:27:01,270] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:01,188] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default2]:[2022-10-06 14:27:01,263] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:01,263] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default3]:[2022-10-06 14:27:01,220] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_01-model_states.pt. +[default3]:[2022-10-06 14:27:01,230] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default3]:[2022-10-06 14:27:01,263] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default3]:[2022-10-06 14:27:01,264] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_01-model_states.pt... +[default0]:[2022-10-06 14:27:01,230] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default0]:[2022-10-06 14:27:01,230] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:01,261] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default0]:[2022-10-06 14:27:01,268] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:01,281] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default0]:[2022-10-06 14:27:01,281] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default1]:[2022-10-06 14:27:01,214] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_01-model_states.pt. +[default1]:[2022-10-06 14:27:01,220] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default1]:[2022-10-06 14:27:01,250] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default1]:[2022-10-06 14:27:01,252] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_01-model_states.pt... +[default1]:[2022-10-06 14:27:01,300] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_01-model_states.pt. +[default3]:[2022-10-06 14:27:01,220] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_01-model_states.pt. +[default3]:[2022-10-06 14:27:01,228] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default3]:[2022-10-06 14:27:01,260] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default3]:[2022-10-06 14:27:01,260] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_01-model_states.pt... +[default3]:[2022-10-06 14:27:01,302] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_01-model_states.pt. +[default2]:[2022-10-06 14:27:01,209] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:01,209] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default2]:[2022-10-06 14:27:01,230] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:01,235] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default2]:[2022-10-06 14:27:01,288] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:01,288] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:01,207] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:01,207] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:01,236] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:01,241] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:01,288] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:01,288] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:01,264] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default0]:[2022-10-06 14:27:01,264] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:01,301] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default0]:[2022-10-06 14:27:01,311] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default3]:[2022-10-06 14:27:01,294] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_01-model_states.pt. +[default3]:[2022-10-06 14:27:01,305] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default3]:[2022-10-06 14:27:01,328] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default3]:[2022-10-06 14:27:01,328] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_01-model_states.pt... +[default3]:[2022-10-06 14:27:01,372] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_01-model_states.pt. +[default1]:[2022-10-06 14:27:01,294] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_01-model_states.pt. +[default1]:[2022-10-06 14:27:01,304] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default1]:[2022-10-06 14:27:01,329] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default1]:[2022-10-06 14:27:01,329] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_01-model_states.pt... +[default0]:[2022-10-06 14:27:01,294] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default0]:[2022-10-06 14:27:01,294] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:01,314] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default0]:[2022-10-06 14:27:01,322] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:01,335] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default0]:[2022-10-06 14:27:01,336] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default2]:[2022-10-06 14:27:01,332] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:01,333] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default2]:[2022-10-06 14:27:01,365] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:01,370] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default2]:[2022-10-06 14:27:01,279] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default2]:[2022-10-06 14:27:01,310] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:01,311] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default2]:[2022-10-06 14:27:01,338] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:01,350] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default4]:[2022-10-06 14:27:01,378] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_02_model_states.pt. +[default4]:[2022-10-06 14:27:01,379] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:01,375] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_02_model_states.pt. +[default6]:[2022-10-06 14:27:01,376] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:01,366] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_02_model_states.pt. +[default0]:[2022-10-06 14:27:01,367] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default2]:[2022-10-06 14:27:01,304] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:01,316] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default2]:[2022-10-06 14:27:01,364] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_02_model_states.pt. +[default2]:[2022-10-06 14:27:01,364] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default3]:[2022-10-06 14:27:01,330] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_01-model_states.pt. +[default3]:[2022-10-06 14:27:01,336] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:01,324] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default0]:[2022-10-06 14:27:01,330] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:01,362] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default0]:[2022-10-06 14:27:01,362] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:01,392] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default0]:[2022-10-06 14:27:01,401] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default1]:[2022-10-06 14:27:01,307] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default1]:[2022-10-06 14:27:01,347] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default1]:[2022-10-06 14:27:01,349] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_01-model_states.pt... +[default1]:[2022-10-06 14:27:01,396] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_01-model_states.pt. +[default1]:[2022-10-06 14:27:01,402] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default2]:[2022-10-06 14:27:01,309] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:01,314] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default3]:[2022-10-06 14:27:01,308] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default2]:[2022-10-06 14:27:01,357] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default3]:[2022-10-06 14:27:01,359] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default3]:[2022-10-06 14:27:01,359] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_01-model_states.pt... +[default2]:[2022-10-06 14:27:01,357] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default2]:[2022-10-06 14:27:01,379] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:01,384] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:01,315] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:01,321] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:01,362] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:01,362] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:01,386] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:01,392] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default5]:[2022-10-06 14:27:01,405] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_01_model_states.pt. +[default5]:[2022-10-06 14:27:01,405] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:01,441] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default0]:[2022-10-06 14:27:01,442] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default3]:[2022-10-06 14:27:01,425] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_01_model_states.pt. +[default3]:[2022-10-06 14:27:01,426] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default1]:[2022-10-06 14:27:01,412] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_01_model_states.pt. +[default1]:[2022-10-06 14:27:01,412] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default7]:[2022-10-06 14:27:01,393] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_01_model_states.pt. +[default7]:[2022-10-06 14:27:01,393] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:01,427] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:01,428] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default2]:[2022-10-06 14:27:01,415] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:01,416] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default3]:[2022-10-06 14:27:01,389] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default3]:[2022-10-06 14:27:01,412] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default3]:[2022-10-06 14:27:01,413] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_01-model_states.pt... +[default0]:[2022-10-06 14:27:01,374] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default0]:[2022-10-06 14:27:01,379] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:01,419] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default0]:[2022-10-06 14:27:01,419] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:01,443] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default0]:[2022-10-06 14:27:01,455] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default3]:[2022-10-06 14:27:01,463] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_01-model_states.pt. +[default1]:[2022-10-06 14:27:01,375] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_01-model_states.pt. +[default0]:[2022-10-06 14:27:01,376] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default0]:[2022-10-06 14:27:01,377] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:01,400] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default0]:[2022-10-06 14:27:01,407] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:01,462] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default0]:[2022-10-06 14:27:01,462] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default1]:[2022-10-06 14:27:01,393] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default1]:[2022-10-06 14:27:01,422] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default1]:[2022-10-06 14:27:01,422] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_01-model_states.pt... +[default1]:[2022-10-06 14:27:01,471] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_01-model_states.pt. +[default1]:[2022-10-06 14:27:01,442] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_01-model_states.pt. +[default3]:[2022-10-06 14:27:01,443] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_01-model_states.pt. +[default2]:[2022-10-06 14:27:01,419] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:01,419] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default2]:[2022-10-06 14:27:01,439] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:01,449] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default4]:[2022-10-06 14:27:01,394] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:01,395] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default2]:[2022-10-06 14:27:01,388] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:01,388] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default2]:[2022-10-06 14:27:01,414] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:01,422] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default2]:[2022-10-06 14:27:01,471] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:01,471] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default4]:[2022-10-06 14:27:01,427] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:01,428] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default4]:[2022-10-06 14:27:01,479] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:01,423] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:01,424] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:01,461] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:01,465] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:01,414] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default0]:[2022-10-06 14:27:01,414] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:01,457] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default0]:[2022-10-06 14:27:01,464] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default1]:[2022-10-06 14:27:01,454] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_03_model_states.pt. +[default1]:[2022-10-06 14:27:01,458] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default5]:[2022-10-06 14:27:01,453] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_03_model_states.pt. +[default5]:[2022-10-06 14:27:01,454] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default2]:[2022-10-06 14:27:01,458] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:01,458] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default2]:[2022-10-06 14:27:01,483] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default3]:[2022-10-06 14:27:01,456] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_03_model_states.pt. +[default3]:[2022-10-06 14:27:01,458] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default7]:[2022-10-06 14:27:01,455] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_03_model_states.pt. +[default7]:[2022-10-06 14:27:01,458] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default2]:[2022-10-06 14:27:01,413] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:01,414] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default2]:[2022-10-06 14:27:01,447] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:01,454] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default1]:[2022-10-06 14:27:01,447] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_03_model_states.pt. +[default1]:[2022-10-06 14:27:01,448] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default4]:[2022-10-06 14:27:01,479] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_02_model_states.pt. +[default3]:[2022-10-06 14:27:01,461] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default3]:[2022-10-06 14:27:01,462] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_01-model_states.pt... +[default4]:[2022-10-06 14:27:01,479] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:01,433] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default0]:[2022-10-06 14:27:01,433] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:01,459] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default0]:[2022-10-06 14:27:01,465] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:01,502] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default0]:[2022-10-06 14:27:01,502] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default1]:[2022-10-06 14:27:01,463] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default1]:[2022-10-06 14:27:01,464] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_01-model_states.pt... +[default2]:[2022-10-06 14:27:01,426] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:01,426] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default2]:[2022-10-06 14:27:01,444] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:01,450] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default2]:[2022-10-06 14:27:01,494] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:01,494] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default3]:[2022-10-06 14:27:01,405] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_01-model_states.pt. +[default3]:[2022-10-06 14:27:01,411] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default3]:[2022-10-06 14:27:01,470] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default3]:[2022-10-06 14:27:01,470] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_01-model_states.pt... +[default6]:[2022-10-06 14:27:01,428] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:01,428] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:01,449] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:01,456] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:01,501] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:01,502] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:01,456] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default0]:[2022-10-06 14:27:01,456] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:01,477] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default0]:[2022-10-06 14:27:01,482] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default5]:[2022-10-06 14:27:01,539] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default5]:[2022-10-06 14:27:01,541] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_01-model_states.pt... +[default7]:[2022-10-06 14:27:01,490] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default7]:[2022-10-06 14:27:01,494] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_01-model_states.pt... +[default0]:[2022-10-06 14:27:01,508] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default0]:[2022-10-06 14:27:01,508] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:01,535] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default0]:[2022-10-06 14:27:01,540] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default1]:[2022-10-06 14:27:01,486] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default1]:[2022-10-06 14:27:01,513] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default1]:[2022-10-06 14:27:01,513] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_01-model_states.pt... +[default3]:[2022-10-06 14:27:01,477] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:01,482] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default1]:[2022-10-06 14:27:01,542] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_01-model_states.pt. +[default3]:[2022-10-06 14:27:01,500] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default3]:[2022-10-06 14:27:01,500] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_01-model_states.pt... +[default3]:[2022-10-06 14:27:01,540] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_01-model_states.pt. +[default3]:[2022-10-06 14:27:01,552] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default1]:[2022-10-06 14:27:01,557] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:01,490] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default1]:[2022-10-06 14:27:01,518] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default1]:[2022-10-06 14:27:01,554] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default1]:[2022-10-06 14:27:01,555] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_01-model_states.pt... +[default3]:[2022-10-06 14:27:01,508] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default3]:[2022-10-06 14:27:01,552] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default3]:[2022-10-06 14:27:01,555] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_01-model_states.pt... +[default2]:[2022-10-06 14:27:01,504] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:01,504] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default2]:[2022-10-06 14:27:01,528] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:01,533] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default4]:[2022-10-06 14:27:01,495] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:01,573] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default2]:[2022-10-06 14:27:01,499] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:01,509] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default3]:[2022-10-06 14:27:01,535] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_03_model_states.pt. +[default3]:[2022-10-06 14:27:01,536] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default3]:[2022-10-06 14:27:01,559] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default3]:[2022-10-06 14:27:01,559] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_01-model_states.pt... +[default4]:[2022-10-06 14:27:01,502] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default4]:[2022-10-06 14:27:01,544] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:01,544] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default1]:[2022-10-06 14:27:01,527] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_03_model_states.pt. +[default1]:[2022-10-06 14:27:01,528] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default1]:[2022-10-06 14:27:01,546] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default1]:[2022-10-06 14:27:01,546] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_01-model_states.pt... +[default1]:[2022-10-06 14:27:01,566] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_01-model_states.pt. +[default1]:[2022-10-06 14:27:01,571] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:01,506] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:01,506] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:01,541] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:01,545] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:01,505] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default0]:[2022-10-06 14:27:01,505] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:01,540] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default0]:[2022-10-06 14:27:01,545] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default1]:[2022-10-06 14:27:01,500] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default1]:[2022-10-06 14:27:01,501] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_01-model_states.pt... +[default1]:[2022-10-06 14:27:01,555] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_01-model_states.pt. +[default1]:[2022-10-06 14:27:01,559] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default5]:[2022-10-06 14:27:01,503] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default5]:[2022-10-06 14:27:01,503] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_01-model_states.pt... +[default5]:[2022-10-06 14:27:01,549] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_01-model_states.pt. +[default5]:[2022-10-06 14:27:01,553] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:01,560] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_02_model_states.pt. +[default6]:[2022-10-06 14:27:01,561] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default2]:[2022-10-06 14:27:01,496] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default2]:[2022-10-06 14:27:01,545] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:01,545] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default2]:[2022-10-06 14:27:01,575] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:01,582] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default3]:[2022-10-06 14:27:01,507] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default3]:[2022-10-06 14:27:01,507] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_01-model_states.pt... +[default3]:[2022-10-06 14:27:01,558] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_01-model_states.pt. +[default3]:[2022-10-06 14:27:01,565] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default7]:[2022-10-06 14:27:01,508] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default7]:[2022-10-06 14:27:01,508] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_01-model_states.pt... +[default7]:[2022-10-06 14:27:01,561] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_01-model_states.pt. +[default7]:[2022-10-06 14:27:01,566] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default2]:[2022-10-06 14:27:01,496] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:01,496] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default2]:[2022-10-06 14:27:01,521] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:01,527] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default2]:[2022-10-06 14:27:01,585] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:01,585] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default4]:[2022-10-06 14:27:01,583] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_02_model_states.pt. +[default4]:[2022-10-06 14:27:01,584] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default1]:[2022-10-06 14:27:01,493] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default1]:[2022-10-06 14:27:01,494] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_01-model_states.pt... +[default1]:[2022-10-06 14:27:01,521] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_01-model_states.pt. +[default1]:[2022-10-06 14:27:01,529] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default1]:[2022-10-06 14:27:01,565] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default1]:[2022-10-06 14:27:01,566] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_01-model_states.pt... +[default3]:[2022-10-06 14:27:01,517] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_01-model_states.pt. +[default3]:[2022-10-06 14:27:01,529] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default3]:[2022-10-06 14:27:01,564] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default3]:[2022-10-06 14:27:01,565] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_01-model_states.pt... +[default4]:[2022-10-06 14:27:01,513] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:01,513] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default4]:[2022-10-06 14:27:01,547] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:01,555] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default4]:[2022-10-06 14:27:01,586] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:01,586] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:01,539] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default0]:[2022-10-06 14:27:01,547] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:01,580] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default1]:[2022-10-06 14:27:01,513] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_01-model_states.pt. +[default1]:[2022-10-06 14:27:01,521] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default1]:[2022-10-06 14:27:01,549] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default1]:[2022-10-06 14:27:01,550] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_01-model_states.pt... +[default1]:[2022-10-06 14:27:01,601] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_01-model_states.pt. +[default0]:[2022-10-06 14:27:01,580] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default2]:[2022-10-06 14:27:01,517] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:01,524] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default2]:[2022-10-06 14:27:01,564] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:01,527] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:01,533] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:01,565] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:01,565] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default2]:[2022-10-06 14:27:01,564] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:01,592] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:01,587] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default3]:[2022-10-06 14:27:01,514] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_01-model_states.pt. +[default3]:[2022-10-06 14:27:01,523] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default3]:[2022-10-06 14:27:01,556] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default3]:[2022-10-06 14:27:01,556] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_01-model_states.pt... +[default6]:[2022-10-06 14:27:01,527] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_02_model_states.pt. +[default6]:[2022-10-06 14:27:01,528] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:01,556] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:01,556] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:01,580] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:01,586] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:01,615] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:01,615] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:01,544] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default0]:[2022-10-06 14:27:01,544] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:01,564] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default0]:[2022-10-06 14:27:01,569] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:01,643] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default3]:[2022-10-06 14:27:01,606] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default3]:[2022-10-06 14:27:01,607] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_01-model_states.pt... +[default1]:[2022-10-06 14:27:01,584] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default1]:[2022-10-06 14:27:01,585] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_01-model_states.pt... +[default6]:[2022-10-06 14:27:01,608] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:01,658] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default7]:[2022-10-06 14:27:01,647] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_01-model_states.pt. +[default2]:[2022-10-06 14:27:01,582] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:01,616] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default2]:[2022-10-06 14:27:01,651] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:01,652] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:01,612] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default0]:[2022-10-06 14:27:01,613] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:01,671] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default1]:[2022-10-06 14:27:01,607] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default1]:[2022-10-06 14:27:01,607] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_01-model_states.pt... +[default1]:[2022-10-06 14:27:01,657] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_01-model_states.pt. +[default1]:[2022-10-06 14:27:01,669] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:01,600] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default3]:[2022-10-06 14:27:01,602] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default3]:[2022-10-06 14:27:01,603] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_01-model_states.pt... +[default3]:[2022-10-06 14:27:01,655] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_01-model_states.pt. +[default3]:[2022-10-06 14:27:01,666] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:01,600] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:01,617] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default0]:[2022-10-06 14:27:01,625] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default1]:[2022-10-06 14:27:01,594] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_01-model_states.pt. +[default3]:[2022-10-06 14:27:01,591] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_01-model_states.pt. +[default3]:[2022-10-06 14:27:01,599] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default3]:[2022-10-06 14:27:01,635] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default1]:[2022-10-06 14:27:01,602] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default1]:[2022-10-06 14:27:01,637] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default1]:[2022-10-06 14:27:01,637] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_01-model_states.pt... +[default3]:[2022-10-06 14:27:01,636] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_01-model_states.pt... +[default3]:[2022-10-06 14:27:01,673] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_01-model_states.pt. +[default1]:[2022-10-06 14:27:01,674] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_01-model_states.pt. +[default2]:[2022-10-06 14:27:01,612] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:01,613] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default2]:[2022-10-06 14:27:01,666] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:01,673] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default4]:[2022-10-06 14:27:01,619] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:01,619] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default4]:[2022-10-06 14:27:01,639] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:01,646] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default2]:[2022-10-06 14:27:01,605] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:01,605] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default2]:[2022-10-06 14:27:01,631] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:01,640] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default3]:[2022-10-06 14:27:01,581] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_01-model_states.pt. +[default3]:[2022-10-06 14:27:01,587] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default3]:[2022-10-06 14:27:01,611] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default3]:[2022-10-06 14:27:01,611] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_01-model_states.pt... +[default3]:[2022-10-06 14:27:01,633] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_01-model_states.pt. +[default3]:[2022-10-06 14:27:01,639] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default3]:[2022-10-06 14:27:01,667] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default3]:[2022-10-06 14:27:01,667] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_01-model_states.pt... +[default4]:[2022-10-06 14:27:01,586] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:01,609] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default4]:[2022-10-06 14:27:01,659] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:01,660] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default1]:[2022-10-06 14:27:01,595] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default1]:[2022-10-06 14:27:01,595] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_01-model_states.pt... +[default1]:[2022-10-06 14:27:01,631] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_01-model_states.pt. +[default1]:[2022-10-06 14:27:01,635] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default1]:[2022-10-06 14:27:01,661] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default1]:[2022-10-06 14:27:01,662] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_01-model_states.pt... +[default6]:[2022-10-06 14:27:01,592] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:01,592] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:01,633] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:01,639] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:01,677] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:01,677] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:01,595] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default0]:[2022-10-06 14:27:01,596] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:01,637] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default0]:[2022-10-06 14:27:01,649] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default1]:[2022-10-06 14:27:01,610] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default1]:[2022-10-06 14:27:01,610] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_01-model_states.pt... +[default1]:[2022-10-06 14:27:01,663] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_01-model_states.pt. +[default1]:[2022-10-06 14:27:01,669] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default5]:[2022-10-06 14:27:01,603] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default5]:[2022-10-06 14:27:01,603] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_01-model_states.pt... +[default5]:[2022-10-06 14:27:01,655] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_01-model_states.pt. +[default5]:[2022-10-06 14:27:01,659] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:01,590] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_02_model_states.pt. +[default0]:[2022-10-06 14:27:01,591] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:01,649] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default0]:[2022-10-06 14:27:01,650] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:01,585] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:01,586] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:01,611] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:01,616] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:01,641] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:01,641] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:01,660] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:01,666] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default2]:[2022-10-06 14:27:01,634] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:01,634] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default2]:[2022-10-06 14:27:01,666] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:01,674] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default3]:[2022-10-06 14:27:01,619] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default3]:[2022-10-06 14:27:01,620] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_01-model_states.pt... +[default3]:[2022-10-06 14:27:01,672] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_01-model_states.pt. +[default3]:[2022-10-06 14:27:01,682] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default7]:[2022-10-06 14:27:01,625] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default7]:[2022-10-06 14:27:01,625] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_01-model_states.pt... +[default7]:[2022-10-06 14:27:01,671] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_01-model_states.pt. +[default7]:[2022-10-06 14:27:01,682] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default2]:[2022-10-06 14:27:01,618] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:01,625] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default2]:[2022-10-06 14:27:01,672] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:01,672] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default2]:[2022-10-06 14:27:01,632] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_02_model_states.pt. +[default4]:[2022-10-06 14:27:01,616] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:01,633] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default2]:[2022-10-06 14:27:01,667] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:01,668] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default4]:[2022-10-06 14:27:01,616] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default4]:[2022-10-06 14:27:01,648] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:01,653] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default4]:[2022-10-06 14:27:01,685] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:01,685] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default1]:[2022-10-06 14:27:01,603] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_01-model_states.pt. +[default1]:[2022-10-06 14:27:01,609] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default1]:[2022-10-06 14:27:01,640] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default1]:[2022-10-06 14:27:01,640] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_01-model_states.pt... +[default1]:[2022-10-06 14:27:01,671] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_01-model_states.pt. +[default1]:[2022-10-06 14:27:01,678] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default3]:[2022-10-06 14:27:01,608] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_01-model_states.pt. +[default3]:[2022-10-06 14:27:01,613] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default3]:[2022-10-06 14:27:01,646] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default3]:[2022-10-06 14:27:01,647] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_01-model_states.pt... +[default4]:[2022-10-06 14:27:01,614] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:01,622] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default4]:[2022-10-06 14:27:01,654] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:01,654] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default4]:[2022-10-06 14:27:01,684] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default0]:[2022-10-06 14:27:01,615] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default1]:[2022-10-06 14:27:01,606] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default1]:[2022-10-06 14:27:01,631] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default1]:[2022-10-06 14:27:01,632] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_01-model_states.pt... +[default1]:[2022-10-06 14:27:01,678] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_01-model_states.pt. +[default1]:[2022-10-06 14:27:01,682] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:01,621] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default3]:[2022-10-06 14:27:01,605] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_01-model_states.pt. +[default3]:[2022-10-06 14:27:01,611] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default3]:[2022-10-06 14:27:01,641] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default3]:[2022-10-06 14:27:01,641] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_01-model_states.pt... +[default3]:[2022-10-06 14:27:01,682] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_01-model_states.pt. +[default3]:[2022-10-06 14:27:01,689] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:01,644] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default0]:[2022-10-06 14:27:01,644] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:01,668] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:01,651] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:01,658] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:01,695] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:01,695] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:01,724] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default0]:[2022-10-06 14:27:01,631] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default0]:[2022-10-06 14:27:01,632] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:01,652] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default0]:[2022-10-06 14:27:01,656] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:01,697] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default0]:[2022-10-06 14:27:01,698] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:01,719] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default0]:[2022-10-06 14:27:01,724] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default5]:[2022-10-06 14:27:01,672] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_01-model_states.pt. +[default5]:[2022-10-06 14:27:01,725] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default5]:[2022-10-06 14:27:01,757] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default5]:[2022-10-06 14:27:01,757] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_01-model_states.pt... +[default0]:[2022-10-06 14:27:01,687] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:01,726] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default0]:[2022-10-06 14:27:01,727] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:01,770] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default3]:[2022-10-06 14:27:01,765] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_01-model_states.pt. +[default1]:[2022-10-06 14:27:01,734] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_01-model_states.pt. +[default1]:[2022-10-06 14:27:01,769] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:01,692] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:01,692] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:01,721] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:01,729] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:01,762] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:01,762] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default7]:[2022-10-06 14:27:01,690] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default7]:[2022-10-06 14:27:01,713] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default7]:[2022-10-06 14:27:01,714] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_01-model_states.pt... +[default7]:[2022-10-06 14:27:01,742] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_01-model_states.pt. +[default7]:[2022-10-06 14:27:01,748] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default2]:[2022-10-06 14:27:01,673] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:01,680] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default2]:[2022-10-06 14:27:01,714] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:01,714] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default2]:[2022-10-06 14:27:01,742] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:01,751] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:01,680] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:01,722] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default0]:[2022-10-06 14:27:01,722] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:01,752] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default0]:[2022-10-06 14:27:01,758] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default1]:[2022-10-06 14:27:01,710] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default1]:[2022-10-06 14:27:01,711] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_01-model_states.pt... +[default1]:[2022-10-06 14:27:01,745] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_01-model_states.pt. +[default1]:[2022-10-06 14:27:01,755] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default3]:[2022-10-06 14:27:01,695] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default0]:[2022-10-06 14:27:01,681] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default3]:[2022-10-06 14:27:01,695] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_01-model_states.pt... +[default0]:[2022-10-06 14:27:01,681] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:01,706] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default0]:[2022-10-06 14:27:01,716] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:01,765] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default3]:[2022-10-06 14:27:01,740] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_01-model_states.pt. +[default3]:[2022-10-06 14:27:01,751] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:01,766] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default1]:[2022-10-06 14:27:01,681] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default3]:[2022-10-06 14:27:01,680] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default3]:[2022-10-06 14:27:01,711] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default3]:[2022-10-06 14:27:01,712] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_01-model_states.pt... +[default1]:[2022-10-06 14:27:01,715] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default1]:[2022-10-06 14:27:01,715] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_01-model_states.pt... +[default1]:[2022-10-06 14:27:01,774] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_01-model_states.pt. +[default2]:[2022-10-06 14:27:01,719] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:01,719] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default2]:[2022-10-06 14:27:01,745] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:01,750] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default4]:[2022-10-06 14:27:01,676] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:01,676] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default4]:[2022-10-06 14:27:01,693] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:01,701] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default4]:[2022-10-06 14:27:01,746] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:01,746] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default4]:[2022-10-06 14:27:01,768] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:01,775] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default2]:[2022-10-06 14:27:01,685] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:01,685] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default2]:[2022-10-06 14:27:01,720] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:01,729] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default2]:[2022-10-06 14:27:01,773] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:01,774] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default3]:[2022-10-06 14:27:01,693] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_01-model_states.pt. +[default3]:[2022-10-06 14:27:01,699] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default3]:[2022-10-06 14:27:01,722] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default3]:[2022-10-06 14:27:01,723] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_01-model_states.pt... +[default3]:[2022-10-06 14:27:01,755] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_01-model_states.pt. +[default3]:[2022-10-06 14:27:01,762] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default4]:[2022-10-06 14:27:01,704] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:01,724] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default4]:[2022-10-06 14:27:01,774] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:01,774] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default5]:[2022-10-06 14:27:01,705] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_03_model_states.pt. +[default5]:[2022-10-06 14:27:01,706] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default5]:[2022-10-06 14:27:01,750] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default5]:[2022-10-06 14:27:01,751] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_01-model_states.pt... +[default1]:[2022-10-06 14:27:01,693] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_01-model_states.pt. +[default1]:[2022-10-06 14:27:01,697] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default1]:[2022-10-06 14:27:01,720] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default1]:[2022-10-06 14:27:01,721] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_01-model_states.pt... +[default1]:[2022-10-06 14:27:01,755] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_01-model_states.pt. +[default1]:[2022-10-06 14:27:01,759] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:01,718] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:01,723] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:01,756] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:01,756] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:01,688] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default0]:[2022-10-06 14:27:01,688] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:01,729] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default0]:[2022-10-06 14:27:01,740] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:01,772] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default0]:[2022-10-06 14:27:01,772] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default1]:[2022-10-06 14:27:01,713] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default1]:[2022-10-06 14:27:01,713] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_01-model_states.pt... +[default1]:[2022-10-06 14:27:01,759] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_01-model_states.pt. +[default1]:[2022-10-06 14:27:01,764] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default5]:[2022-10-06 14:27:01,696] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default5]:[2022-10-06 14:27:01,696] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_01-model_states.pt... +[default5]:[2022-10-06 14:27:01,734] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_01-model_states.pt. +[default5]:[2022-10-06 14:27:01,739] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default5]:[2022-10-06 14:27:01,761] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default5]:[2022-10-06 14:27:01,762] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_01-model_states.pt... +[default0]:[2022-10-06 14:27:01,687] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default0]:[2022-10-06 14:27:01,694] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:01,737] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default0]:[2022-10-06 14:27:01,738] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:01,764] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default0]:[2022-10-06 14:27:01,772] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:01,695] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:01,696] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:01,717] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:01,721] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:01,761] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:01,761] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:01,781] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:01,711] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:01,711] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default2]:[2022-10-06 14:27:01,738] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default3]:[2022-10-06 14:27:01,746] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default3]:[2022-10-06 14:27:01,746] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_01-model_states.pt... +[default2]:[2022-10-06 14:27:01,742] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default2]:[2022-10-06 14:27:01,780] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:01,781] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default7]:[2022-10-06 14:27:01,752] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default7]:[2022-10-06 14:27:01,752] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_01-model_states.pt... +[default2]:[2022-10-06 14:27:01,710] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:01,712] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:01,718] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default2]:[2022-10-06 14:27:01,716] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default2]:[2022-10-06 14:27:01,762] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:01,762] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default4]:[2022-10-06 14:27:01,751] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:01,751] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default4]:[2022-10-06 14:27:01,782] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default7]:[2022-10-06 14:27:01,712] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_03_model_states.pt. +[default2]:[2022-10-06 14:27:01,703] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:01,710] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default7]:[2022-10-06 14:27:01,713] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default7]:[2022-10-06 14:27:01,761] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default7]:[2022-10-06 14:27:01,761] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_01-model_states.pt... +[default2]:[2022-10-06 14:27:01,751] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:01,751] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default1]:[2022-10-06 14:27:01,712] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default1]:[2022-10-06 14:27:01,713] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_01-model_states.pt... +[default1]:[2022-10-06 14:27:01,745] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_01-model_states.pt. +[default1]:[2022-10-06 14:27:01,752] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default1]:[2022-10-06 14:27:01,781] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default1]:[2022-10-06 14:27:01,781] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_01-model_states.pt... +[default3]:[2022-10-06 14:27:01,696] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_01-model_states.pt. +[default3]:[2022-10-06 14:27:01,703] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default3]:[2022-10-06 14:27:01,735] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default3]:[2022-10-06 14:27:01,736] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_01-model_states.pt... +[default4]:[2022-10-06 14:27:01,694] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default4]:[2022-10-06 14:27:01,724] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:01,724] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default4]:[2022-10-06 14:27:01,761] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:01,767] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default4]:[2022-10-06 14:27:01,771] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default4]:[2022-10-06 14:27:01,771] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default1]:[2022-10-06 14:27:01,707] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default1]:[2022-10-06 14:27:01,708] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_01-model_states.pt... +[default1]:[2022-10-06 14:27:01,747] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_01-model_states.pt. +[default1]:[2022-10-06 14:27:01,751] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default1]:[2022-10-06 14:27:01,768] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default1]:[2022-10-06 14:27:01,769] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_01-model_states.pt... +[default3]:[2022-10-06 14:27:01,717] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default3]:[2022-10-06 14:27:01,718] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_01-model_states.pt... +[default3]:[2022-10-06 14:27:01,750] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_01-model_states.pt. +[default3]:[2022-10-06 14:27:01,755] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default3]:[2022-10-06 14:27:01,777] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default3]:[2022-10-06 14:27:01,778] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_01-model_states.pt... +[default7]:[2022-10-06 14:27:01,787] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_01_model_states.pt. +[default7]:[2022-10-06 14:27:01,787] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:01,731] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:01,762] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:01,763] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:01,790] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:01,800] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:01,777] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default0]:[2022-10-06 14:27:01,777] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:01,798] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default0]:[2022-10-06 14:27:01,802] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default5]:[2022-10-06 14:27:01,784] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_01-model_states.pt. +[default5]:[2022-10-06 14:27:01,792] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default5]:[2022-10-06 14:27:01,819] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default5]:[2022-10-06 14:27:01,819] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_01-model_states.pt... +[default5]:[2022-10-06 14:27:01,841] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_01-model_states.pt. +[default5]:[2022-10-06 14:27:01,847] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:01,776] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:01,809] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default0]:[2022-10-06 14:27:01,809] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:01,839] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default0]:[2022-10-06 14:27:01,847] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default3]:[2022-10-06 14:27:01,843] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default1]:[2022-10-06 14:27:01,806] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default1]:[2022-10-06 14:27:01,807] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_01-model_states.pt... +[default1]:[2022-10-06 14:27:01,841] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_01-model_states.pt. +[default1]:[2022-10-06 14:27:01,848] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:01,786] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:01,791] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:01,818] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:01,819] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:01,846] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:01,852] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default7]:[2022-10-06 14:27:01,773] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default7]:[2022-10-06 14:27:01,774] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_01-model_states.pt... +[default7]:[2022-10-06 14:27:01,824] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_01-model_states.pt. +[default7]:[2022-10-06 14:27:01,829] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default7]:[2022-10-06 14:27:01,853] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default7]:[2022-10-06 14:27:01,854] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_01-model_states.pt... +[default2]:[2022-10-06 14:27:01,782] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:01,782] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default2]:[2022-10-06 14:27:01,814] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:01,822] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default2]:[2022-10-06 14:27:01,854] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:01,854] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:01,795] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default0]:[2022-10-06 14:27:01,796] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:01,827] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default0]:[2022-10-06 14:27:01,834] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default1]:[2022-10-06 14:27:01,790] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default1]:[2022-10-06 14:27:01,790] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_01-model_states.pt... +[default1]:[2022-10-06 14:27:01,852] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_01-model_states.pt. +[default1]:[2022-10-06 14:27:01,868] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default3]:[2022-10-06 14:27:01,784] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default0]:[2022-10-06 14:27:01,784] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default3]:[2022-10-06 14:27:01,785] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_01-model_states.pt... +[default3]:[2022-10-06 14:27:01,775] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_01-model_states.pt. +[default3]:[2022-10-06 14:27:01,781] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default3]:[2022-10-06 14:27:01,811] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default3]:[2022-10-06 14:27:01,812] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_01-model_states.pt... +[default3]:[2022-10-06 14:27:01,853] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_01-model_states.pt. +[default3]:[2022-10-06 14:27:01,860] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default1]:[2022-10-06 14:27:01,782] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default1]:[2022-10-06 14:27:01,814] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default1]:[2022-10-06 14:27:01,815] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_01-model_states.pt... +[default1]:[2022-10-06 14:27:01,853] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_01-model_states.pt. +[default1]:[2022-10-06 14:27:01,861] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:01,793] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:01,847] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default3]:[2022-10-06 14:27:01,847] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_01-model_states.pt. +[default3]:[2022-10-06 14:27:01,859] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:01,847] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:01,871] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:01,819] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:01,819] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default4]:[2022-10-06 14:27:01,844] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:01,851] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default2]:[2022-10-06 14:27:01,791] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:01,792] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default2]:[2022-10-06 14:27:01,817] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:01,822] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default2]:[2022-10-06 14:27:01,804] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:01,814] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default2]:[2022-10-06 14:27:01,856] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:01,857] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default3]:[2022-10-06 14:27:01,788] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default3]:[2022-10-06 14:27:01,788] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_01-model_states.pt... +[default3]:[2022-10-06 14:27:01,831] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_01-model_states.pt. +[default3]:[2022-10-06 14:27:01,838] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default3]:[2022-10-06 14:27:01,861] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default3]:[2022-10-06 14:27:01,861] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_01-model_states.pt... +[default4]:[2022-10-06 14:27:01,809] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default5]:[2022-10-06 14:27:01,798] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_01-model_states.pt. +[default5]:[2022-10-06 14:27:01,805] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default5]:[2022-10-06 14:27:01,837] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default5]:[2022-10-06 14:27:01,837] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_01-model_states.pt... +[default4]:[2022-10-06 14:27:01,828] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default4]:[2022-10-06 14:27:01,860] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:01,861] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default1]:[2022-10-06 14:27:01,784] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default1]:[2022-10-06 14:27:01,785] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_01-model_states.pt... +[default1]:[2022-10-06 14:27:01,831] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_01-model_states.pt. +[default1]:[2022-10-06 14:27:01,835] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default1]:[2022-10-06 14:27:01,859] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default1]:[2022-10-06 14:27:01,860] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_01-model_states.pt... +[default6]:[2022-10-06 14:27:01,798] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:01,802] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:01,869] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:01,869] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:01,806] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default0]:[2022-10-06 14:27:01,811] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:01,871] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default0]:[2022-10-06 14:27:01,872] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default1]:[2022-10-06 14:27:01,803] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default1]:[2022-10-06 14:27:01,803] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_01-model_states.pt... +[default1]:[2022-10-06 14:27:01,839] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_01-model_states.pt. +[default1]:[2022-10-06 14:27:01,844] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default5]:[2022-10-06 14:27:01,798] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_01-model_states.pt. +[default5]:[2022-10-06 14:27:01,803] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default5]:[2022-10-06 14:27:01,829] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default5]:[2022-10-06 14:27:01,830] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_01-model_states.pt... +[default5]:[2022-10-06 14:27:01,875] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_01-model_states.pt. +[default5]:[2022-10-06 14:27:01,880] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:01,803] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default0]:[2022-10-06 14:27:01,804] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:01,831] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default0]:[2022-10-06 14:27:01,838] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:01,872] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default0]:[2022-10-06 14:27:01,872] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:01,790] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:01,828] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:01,829] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:01,846] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:01,853] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default2]:[2022-10-06 14:27:01,809] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:01,816] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default2]:[2022-10-06 14:27:01,852] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:01,853] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default2]:[2022-10-06 14:27:01,884] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default3]:[2022-10-06 14:27:01,802] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_01-model_states.pt. +[default3]:[2022-10-06 14:27:01,811] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default3]:[2022-10-06 14:27:01,870] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default3]:[2022-10-06 14:27:01,870] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_01-model_states.pt... +[default7]:[2022-10-06 14:27:01,804] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_01-model_states.pt. +[default7]:[2022-10-06 14:27:01,813] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default7]:[2022-10-06 14:27:01,865] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default7]:[2022-10-06 14:27:01,866] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_01-model_states.pt... +[default2]:[2022-10-06 14:27:01,807] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:01,814] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default2]:[2022-10-06 14:27:01,863] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:01,864] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default4]:[2022-10-06 14:27:01,789] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default4]:[2022-10-06 14:27:01,814] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:01,814] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default4]:[2022-10-06 14:27:01,845] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:01,851] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default4]:[2022-10-06 14:27:01,881] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:01,881] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default7]:[2022-10-06 14:27:01,804] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_01-model_states.pt. +[default7]:[2022-10-06 14:27:01,810] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default7]:[2022-10-06 14:27:01,850] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default7]:[2022-10-06 14:27:01,850] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_01-model_states.pt... +[default2]:[2022-10-06 14:27:01,795] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:01,803] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default2]:[2022-10-06 14:27:01,862] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:01,862] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default7]:[2022-10-06 14:27:01,807] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_03_model_states.pt. +[default7]:[2022-10-06 14:27:01,808] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default7]:[2022-10-06 14:27:01,852] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default7]:[2022-10-06 14:27:01,852] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_01-model_states.pt... +[default1]:[2022-10-06 14:27:01,825] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_01-model_states.pt. +[default1]:[2022-10-06 14:27:01,833] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default1]:[2022-10-06 14:27:01,865] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default1]:[2022-10-06 14:27:01,865] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_01-model_states.pt... +[default1]:[2022-10-06 14:27:01,889] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_01-model_states.pt. +[default3]:[2022-10-06 14:27:01,795] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_01-model_states.pt. +[default3]:[2022-10-06 14:27:01,801] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default3]:[2022-10-06 14:27:01,836] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default3]:[2022-10-06 14:27:01,837] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_01-model_states.pt... +[default3]:[2022-10-06 14:27:01,886] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_01-model_states.pt. +[default4]:[2022-10-06 14:27:01,801] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:01,802] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default4]:[2022-10-06 14:27:01,830] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:01,837] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default4]:[2022-10-06 14:27:01,865] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:01,865] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default4]:[2022-10-06 14:27:01,891] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:01,892] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default5]:[2022-10-06 14:27:01,881] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_01_model_states.pt. +[default5]:[2022-10-06 14:27:01,882] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default1]:[2022-10-06 14:27:01,822] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_01-model_states.pt. +[default1]:[2022-10-06 14:27:01,829] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default1]:[2022-10-06 14:27:01,844] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default1]:[2022-10-06 14:27:01,845] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_01-model_states.pt... +[default1]:[2022-10-06 14:27:01,878] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_01-model_states.pt. +[default1]:[2022-10-06 14:27:01,885] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default1]:[2022-10-06 14:27:01,901] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default1]:[2022-10-06 14:27:01,902] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_01-model_states.pt... +[default3]:[2022-10-06 14:27:01,823] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_01-model_states.pt. +[default3]:[2022-10-06 14:27:01,829] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default3]:[2022-10-06 14:27:01,851] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default3]:[2022-10-06 14:27:01,852] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_01-model_states.pt... +[default3]:[2022-10-06 14:27:01,879] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_01-model_states.pt. +[default3]:[2022-10-06 14:27:01,885] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default3]:[2022-10-06 14:27:01,902] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default3]:[2022-10-06 14:27:01,902] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_01-model_states.pt... +[default7]:[2022-10-06 14:27:01,890] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default7]:[2022-10-06 14:27:01,891] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_01-model_states.pt... +[default6]:[2022-10-06 14:27:01,829] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:01,830] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:01,855] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:01,864] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:01,895] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:01,895] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:01,925] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default0]:[2022-10-06 14:27:01,852] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default0]:[2022-10-06 14:27:01,852] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:01,874] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default0]:[2022-10-06 14:27:01,880] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:01,926] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default0]:[2022-10-06 14:27:01,926] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default5]:[2022-10-06 14:27:01,880] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default5]:[2022-10-06 14:27:01,880] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_01-model_states.pt... +[default5]:[2022-10-06 14:27:01,905] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_01-model_states.pt. +[default5]:[2022-10-06 14:27:01,911] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default5]:[2022-10-06 14:27:01,938] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default5]:[2022-10-06 14:27:01,938] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_01-model_states.pt... +[default0]:[2022-10-06 14:27:01,892] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default0]:[2022-10-06 14:27:01,892] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:01,921] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default0]:[2022-10-06 14:27:01,927] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:01,968] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default0]:[2022-10-06 14:27:01,969] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default3]:[2022-10-06 14:27:01,888] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default3]:[2022-10-06 14:27:01,888] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_01-model_states.pt... +[default3]:[2022-10-06 14:27:01,922] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_01-model_states.pt. +[default3]:[2022-10-06 14:27:01,935] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default1]:[2022-10-06 14:27:01,881] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default1]:[2022-10-06 14:27:01,881] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_01-model_states.pt... +[default1]:[2022-10-06 14:27:01,907] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_01-model_states.pt. +[default1]:[2022-10-06 14:27:01,912] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default1]:[2022-10-06 14:27:01,945] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default1]:[2022-10-06 14:27:01,945] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_01-model_states.pt... +[default6]:[2022-10-06 14:27:01,889] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:01,889] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:01,919] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:01,925] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:01,966] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:01,966] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default7]:[2022-10-06 14:27:01,896] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_01-model_states.pt. +[default7]:[2022-10-06 14:27:01,901] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default7]:[2022-10-06 14:27:01,920] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default7]:[2022-10-06 14:27:01,921] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_01-model_states.pt... +[default7]:[2022-10-06 14:27:01,964] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_01-model_states.pt. +[default7]:[2022-10-06 14:27:01,969] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default2]:[2022-10-06 14:27:01,886] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:01,890] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default2]:[2022-10-06 14:27:01,945] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:01,945] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:01,879] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default0]:[2022-10-06 14:27:01,879] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:01,905] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default1]:[2022-10-06 14:27:01,901] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default1]:[2022-10-06 14:27:01,901] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_01-model_states.pt... +[default1]:[2022-10-06 14:27:01,939] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_01-model_states.pt. +[default1]:[2022-10-06 14:27:01,955] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:01,880] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default3]:[2022-10-06 14:27:01,886] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default0]:[2022-10-06 14:27:01,942] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default0]:[2022-10-06 14:27:01,942] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:01,973] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default3]:[2022-10-06 14:27:01,887] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_01-model_states.pt... +[default3]:[2022-10-06 14:27:01,937] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_01-model_states.pt. +[default3]:[2022-10-06 14:27:01,950] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default1]:[2022-10-06 14:27:01,933] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default3]:[2022-10-06 14:27:01,936] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default1]:[2022-10-06 14:27:01,934] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_01-model_states.pt... +[default4]:[2022-10-06 14:27:01,940] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:01,940] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default4]:[2022-10-06 14:27:01,961] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:01,969] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default2]:[2022-10-06 14:27:01,889] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:01,900] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default2]:[2022-10-06 14:27:01,946] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:01,947] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default3]:[2022-10-06 14:27:01,898] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_01-model_states.pt. +[default3]:[2022-10-06 14:27:01,905] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default3]:[2022-10-06 14:27:01,946] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default3]:[2022-10-06 14:27:01,947] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_01-model_states.pt... +[default3]:[2022-10-06 14:27:01,980] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_01-model_states.pt. +[default5]:[2022-10-06 14:27:01,883] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_01-model_states.pt. +[default5]:[2022-10-06 14:27:01,894] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default5]:[2022-10-06 14:27:01,929] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default5]:[2022-10-06 14:27:01,929] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_01-model_states.pt... +[default5]:[2022-10-06 14:27:01,979] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_01-model_states.pt. +[default4]:[2022-10-06 14:27:01,908] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:01,914] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default4]:[2022-10-06 14:27:01,960] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:01,961] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default1]:[2022-10-06 14:27:01,887] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default1]:[2022-10-06 14:27:01,887] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_01-model_states.pt... +[default1]:[2022-10-06 14:27:01,920] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_01-model_states.pt. +[default1]:[2022-10-06 14:27:01,928] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default1]:[2022-10-06 14:27:01,976] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default1]:[2022-10-06 14:27:01,977] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_01-model_states.pt... +[default5]:[2022-10-06 14:27:01,910] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default5]:[2022-10-06 14:27:01,911] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_01-model_states.pt... +[default5]:[2022-10-06 14:27:01,965] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_01-model_states.pt. +[default5]:[2022-10-06 14:27:01,971] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:01,897] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default0]:[2022-10-06 14:27:01,905] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:01,941] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default0]:[2022-10-06 14:27:01,941] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:01,973] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default0]:[2022-10-06 14:27:01,980] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:01,895] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:01,895] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:01,914] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:01,929] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:01,974] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:01,974] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default2]:[2022-10-06 14:27:01,892] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default2]:[2022-10-06 14:27:01,926] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:01,926] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default3]:[2022-10-06 14:27:01,912] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_01-model_states.pt. +[default3]:[2022-10-06 14:27:01,928] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default3]:[2022-10-06 14:27:01,956] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default3]:[2022-10-06 14:27:01,956] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_01-model_states.pt... +[default7]:[2022-10-06 14:27:01,909] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_01-model_states.pt. +[default7]:[2022-10-06 14:27:01,920] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default7]:[2022-10-06 14:27:01,955] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default7]:[2022-10-06 14:27:01,956] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_01-model_states.pt... +[default2]:[2022-10-06 14:27:01,903] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:01,908] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default2]:[2022-10-06 14:27:01,947] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:01,947] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default2]:[2022-10-06 14:27:01,987] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:01,911] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:01,922] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default4]:[2022-10-06 14:27:01,958] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:01,958] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default4]:[2022-10-06 14:27:01,986] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default5]:[2022-10-06 14:27:02,000] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default5]:[2022-10-06 14:27:02,001] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_01-model_states.pt... +[default1]:[2022-10-06 14:27:01,934] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_01-model_states.pt. +[default1]:[2022-10-06 14:27:01,940] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default1]:[2022-10-06 14:27:01,971] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default1]:[2022-10-06 14:27:01,972] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_01-model_states.pt... +[default3]:[2022-10-06 14:27:01,935] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_01-model_states.pt. +[default3]:[2022-10-06 14:27:01,941] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default3]:[2022-10-06 14:27:01,978] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default3]:[2022-10-06 14:27:01,978] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_01-model_states.pt... +[default3]:[2022-10-06 14:27:01,936] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_01-model_states.pt... +[default2]:[2022-10-06 14:27:01,879] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:01,879] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default2]:[2022-10-06 14:27:01,901] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:01,906] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:01,910] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:01,965] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:01,966] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default1]:[2022-10-06 14:27:01,896] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_01-model_states.pt. +[default1]:[2022-10-06 14:27:01,901] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default1]:[2022-10-06 14:27:01,946] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default1]:[2022-10-06 14:27:01,947] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_01-model_states.pt... +[default1]:[2022-10-06 14:27:01,980] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_01-model_states.pt. +[default0]:[2022-10-06 14:27:01,908] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default0]:[2022-10-06 14:27:01,913] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:01,960] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default0]:[2022-10-06 14:27:01,961] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default7]:[2022-10-06 14:27:01,897] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_01-model_states.pt. +[default7]:[2022-10-06 14:27:01,903] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default7]:[2022-10-06 14:27:01,937] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default7]:[2022-10-06 14:27:01,937] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_01-model_states.pt... +[default7]:[2022-10-06 14:27:01,985] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_01-model_states.pt. +[default2]:[2022-10-06 14:27:01,893] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:01,901] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default2]:[2022-10-06 14:27:01,951] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:01,951] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default2]:[2022-10-06 14:27:01,954] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:01,958] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default2]:[2022-10-06 14:27:01,959] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:01,959] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default2]:[2022-10-06 14:27:01,959] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default7]:[2022-10-06 14:27:01,891] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_01-model_states.pt. +[default7]:[2022-10-06 14:27:01,908] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default7]:[2022-10-06 14:27:01,960] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default7]:[2022-10-06 14:27:01,960] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_01-model_states.pt... +[default1]:[2022-10-06 14:27:01,898] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default1]:[2022-10-06 14:27:01,925] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default1]:[2022-10-06 14:27:01,926] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_01-model_states.pt... +[default1]:[2022-10-06 14:27:01,948] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_01-model_states.pt. +[default1]:[2022-10-06 14:27:01,955] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default1]:[2022-10-06 14:27:01,984] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default1]:[2022-10-06 14:27:01,984] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_01-model_states.pt... +[default3]:[2022-10-06 14:27:01,895] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default3]:[2022-10-06 14:27:01,926] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default3]:[2022-10-06 14:27:01,927] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_01-model_states.pt... +[default3]:[2022-10-06 14:27:01,980] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_01-model_states.pt. +[default3]:[2022-10-06 14:27:01,986] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default4]:[2022-10-06 14:27:01,892] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:01,901] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default4]:[2022-10-06 14:27:01,934] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:01,934] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default4]:[2022-10-06 14:27:01,963] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:01,970] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:01,933] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:01,957] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:01,958] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:01,985] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:01,991] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:02,016] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:02,016] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:01,943] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default0]:[2022-10-06 14:27:01,948] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:01,950] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default0]:[2022-10-06 14:27:01,951] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:01,951] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default5]:[2022-10-06 14:27:01,975] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_01-model_states.pt. +[default5]:[2022-10-06 14:27:01,983] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default5]:[2022-10-06 14:27:02,024] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default5]:[2022-10-06 14:27:02,024] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_01-model_states.pt... +[default5]:[2022-10-06 14:27:02,054] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_01-model_states.pt. +[default5]:[2022-10-06 14:27:02,062] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:02,010] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default0]:[2022-10-06 14:27:02,016] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:02,059] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default0]:[2022-10-06 14:27:02,060] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default3]:[2022-10-06 14:27:01,989] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default3]:[2022-10-06 14:27:01,989] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_01-model_states.pt... +[default3]:[2022-10-06 14:27:02,031] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_01-model_states.pt. +[default3]:[2022-10-06 14:27:02,046] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default5]:[2022-10-06 14:27:02,040] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_01_model_states.pt. +[default5]:[2022-10-06 14:27:02,041] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default1]:[2022-10-06 14:27:01,975] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_01-model_states.pt. +[default1]:[2022-10-06 14:27:01,981] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default1]:[2022-10-06 14:27:02,017] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default1]:[2022-10-06 14:27:02,018] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_01-model_states.pt... +[default1]:[2022-10-06 14:27:02,052] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_01-model_states.pt. +[default1]:[2022-10-06 14:27:02,057] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:02,007] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:02,013] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:02,055] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:02,055] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default7]:[2022-10-06 14:27:01,994] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default7]:[2022-10-06 14:27:01,994] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_01-model_states.pt... +[default7]:[2022-10-06 14:27:02,038] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_01-model_states.pt. +[default7]:[2022-10-06 14:27:02,043] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default7]:[2022-10-06 14:27:02,065] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default7]:[2022-10-06 14:27:02,067] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_01-model_states.pt... +[default2]:[2022-10-06 14:27:01,979] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:01,984] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default2]:[2022-10-06 14:27:02,040] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:02,041] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default2]:[2022-10-06 14:27:02,073] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default1]:[2022-10-06 14:27:01,990] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default1]:[2022-10-06 14:27:01,990] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_01-model_states.pt... +[default1]:[2022-10-06 14:27:02,025] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_01-model_states.pt. +[default1]:[2022-10-06 14:27:02,041] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default1]:[2022-10-06 14:27:01,981] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_01-model_states.pt. +[default3]:[2022-10-06 14:27:01,981] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_01-model_states.pt. +[default1]:[2022-10-06 14:27:01,987] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default1]:[2022-10-06 14:27:02,029] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default1]:[2022-10-06 14:27:02,030] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_01-model_states.pt... +[default1]:[2022-10-06 14:27:02,057] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_01-model_states.pt. +[default1]:[2022-10-06 14:27:02,063] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default3]:[2022-10-06 14:27:01,987] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default3]:[2022-10-06 14:27:02,027] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default3]:[2022-10-06 14:27:02,028] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_01-model_states.pt... +[default3]:[2022-10-06 14:27:02,057] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_01-model_states.pt. +[default3]:[2022-10-06 14:27:02,064] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default3]:[2022-10-06 14:27:01,979] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default0]:[2022-10-06 14:27:01,981] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:02,050] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default0]:[2022-10-06 14:27:02,050] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:02,072] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default3]:[2022-10-06 14:27:01,980] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_01-model_states.pt... +[default3]:[2022-10-06 14:27:02,023] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_01-model_states.pt. +[default3]:[2022-10-06 14:27:02,036] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default3]:[2022-10-06 14:27:02,064] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default3]:[2022-10-06 14:27:02,065] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_01-model_states.pt... +[default4]:[2022-10-06 14:27:02,033] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:02,033] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default4]:[2022-10-06 14:27:02,054] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:02,062] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default3]:[2022-10-06 14:27:01,987] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default3]:[2022-10-06 14:27:02,026] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default3]:[2022-10-06 14:27:02,027] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_01-model_states.pt... +[default3]:[2022-10-06 14:27:02,061] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_01-model_states.pt. +[default3]:[2022-10-06 14:27:02,071] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default5]:[2022-10-06 14:27:01,988] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default5]:[2022-10-06 14:27:02,027] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default5]:[2022-10-06 14:27:02,027] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_01-model_states.pt... +[default5]:[2022-10-06 14:27:02,073] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_01-model_states.pt. +[default4]:[2022-10-06 14:27:02,009] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:02,024] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default4]:[2022-10-06 14:27:02,069] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:02,069] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:01,999] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:02,003] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:02,048] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:02,048] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default1]:[2022-10-06 14:27:01,985] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default1]:[2022-10-06 14:27:02,026] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default1]:[2022-10-06 14:27:02,027] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_01-model_states.pt... +[default1]:[2022-10-06 14:27:02,061] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_01-model_states.pt. +[default1]:[2022-10-06 14:27:02,065] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:02,000] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default0]:[2022-10-06 14:27:02,010] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:02,051] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default0]:[2022-10-06 14:27:02,051] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default1]:[2022-10-06 14:27:02,010] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_01-model_states.pt. +[default1]:[2022-10-06 14:27:02,015] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default1]:[2022-10-06 14:27:02,060] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default1]:[2022-10-06 14:27:02,060] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_01-model_states.pt... +[default5]:[2022-10-06 14:27:02,007] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default5]:[2022-10-06 14:27:02,008] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_01-model_states.pt... +[default5]:[2022-10-06 14:27:02,057] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_01-model_states.pt. +[default5]:[2022-10-06 14:27:02,062] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:02,008] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default0]:[2022-10-06 14:27:02,008] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:02,032] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default0]:[2022-10-06 14:27:02,039] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:02,071] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default0]:[2022-10-06 14:27:02,071] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:01,994] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:02,001] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:02,050] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:02,050] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:02,079] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:02,084] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default3]:[2022-10-06 14:27:01,986] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_01-model_states.pt. +[default3]:[2022-10-06 14:27:01,996] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default3]:[2022-10-06 14:27:02,022] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default3]:[2022-10-06 14:27:02,022] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_01-model_states.pt... +[default3]:[2022-10-06 14:27:02,043] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_01-model_states.pt. +[default3]:[2022-10-06 14:27:02,048] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default3]:[2022-10-06 14:27:02,079] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default3]:[2022-10-06 14:27:02,079] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_01-model_states.pt... +[default7]:[2022-10-06 14:27:01,991] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_01-model_states.pt. +[default7]:[2022-10-06 14:27:01,999] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default7]:[2022-10-06 14:27:02,040] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default7]:[2022-10-06 14:27:02,041] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_01-model_states.pt... +[default5]:[2022-10-06 14:27:02,035] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_01_model_states.pt. +[default5]:[2022-10-06 14:27:02,035] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default7]:[2022-10-06 14:27:02,035] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_01_model_states.pt. +[default7]:[2022-10-06 14:27:02,035] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default3]:[2022-10-06 14:27:02,037] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_01_model_states.pt. +[default3]:[2022-10-06 14:27:02,038] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default2]:[2022-10-06 14:27:01,995] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default2]:[2022-10-06 14:27:02,041] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:02,041] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default2]:[2022-10-06 14:27:02,087] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:01,994] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default4]:[2022-10-06 14:27:02,023] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:02,024] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default4]:[2022-10-06 14:27:02,052] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:02,056] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default4]:[2022-10-06 14:27:02,087] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default7]:[2022-10-06 14:27:01,990] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default7]:[2022-10-06 14:27:02,036] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default7]:[2022-10-06 14:27:02,036] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_01-model_states.pt... +[default7]:[2022-10-06 14:27:02,085] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_01-model_states.pt. +[default2]:[2022-10-06 14:27:01,992] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:02,000] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default2]:[2022-10-06 14:27:02,030] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:02,030] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default2]:[2022-10-06 14:27:02,062] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:02,073] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default7]:[2022-10-06 14:27:01,995] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_01-model_states.pt. +[default7]:[2022-10-06 14:27:02,003] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default7]:[2022-10-06 14:27:02,060] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default7]:[2022-10-06 14:27:02,060] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_01-model_states.pt... +[default1]:[2022-10-06 14:27:02,010] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_01-model_states.pt. +[default1]:[2022-10-06 14:27:02,016] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default1]:[2022-10-06 14:27:02,043] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default1]:[2022-10-06 14:27:02,044] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_01-model_states.pt... +[default1]:[2022-10-06 14:27:02,072] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_01-model_states.pt. +[default1]:[2022-10-06 14:27:02,078] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default3]:[2022-10-06 14:27:02,005] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default3]:[2022-10-06 14:27:02,006] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_01-model_states.pt... +[default3]:[2022-10-06 14:27:02,054] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_01-model_states.pt. +[default3]:[2022-10-06 14:27:02,061] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default3]:[2022-10-06 14:27:02,061] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default3]:[2022-10-06 14:27:02,062] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_01-model_states.pt... +[default3]:[2022-10-06 14:27:02,064] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_01-model_states.pt. +[default4]:[2022-10-06 14:27:01,993] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:01,993] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default4]:[2022-10-06 14:27:02,017] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:02,022] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default4]:[2022-10-06 14:27:02,048] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:02,048] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default4]:[2022-10-06 14:27:02,074] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:02,080] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default4]:[2022-10-06 14:27:02,011] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:02,051] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default4]:[2022-10-06 14:27:02,072] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:02,072] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default4]:[2022-10-06 14:27:02,088] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:02,095] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default1]:[2022-10-06 14:27:02,010] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_01-model_states.pt. +[default1]:[2022-10-06 14:27:02,017] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default1]:[2022-10-06 14:27:02,033] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default1]:[2022-10-06 14:27:02,034] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_01-model_states.pt... +[default1]:[2022-10-06 14:27:02,068] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_01-model_states.pt. +[default3]:[2022-10-06 14:27:02,011] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_01-model_states.pt. +[default3]:[2022-10-06 14:27:02,019] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default3]:[2022-10-06 14:27:02,042] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default3]:[2022-10-06 14:27:02,042] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_01-model_states.pt... +[default3]:[2022-10-06 14:27:02,069] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_01-model_states.pt. +[default7]:[2022-10-06 14:27:02,055] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_01-model_states.pt. +[default7]:[2022-10-06 14:27:02,099] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default2]:[2022-10-06 14:27:01,985] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:01,994] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default2]:[2022-10-06 14:27:02,059] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:02,060] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:02,039] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:02,045] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:02,069] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:02,069] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:02,091] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:02,098] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:02,123] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:02,123] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default5]:[2022-10-06 14:27:02,104] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default5]:[2022-10-06 14:27:02,105] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_01-model_states.pt... +[default5]:[2022-10-06 14:27:02,133] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_01-model_states.pt. +[default5]:[2022-10-06 14:27:02,140] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default5]:[2022-10-06 14:27:02,167] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default5]:[2022-10-06 14:27:02,167] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_01-model_states.pt... +[default0]:[2022-10-06 14:27:02,106] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default0]:[2022-10-06 14:27:02,112] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:02,152] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default0]:[2022-10-06 14:27:02,153] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default3]:[2022-10-06 14:27:02,098] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default3]:[2022-10-06 14:27:02,098] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_01-model_states.pt... +[default3]:[2022-10-06 14:27:02,147] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_01-model_states.pt. +[default3]:[2022-10-06 14:27:02,157] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default1]:[2022-10-06 14:27:02,097] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default1]:[2022-10-06 14:27:02,097] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_01-model_states.pt... +[default1]:[2022-10-06 14:27:02,128] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_01-model_states.pt. +[default1]:[2022-10-06 14:27:02,133] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default1]:[2022-10-06 14:27:02,166] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default1]:[2022-10-06 14:27:02,166] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_01-model_states.pt... +[default6]:[2022-10-06 14:27:02,095] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:02,101] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:02,135] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:02,135] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:02,167] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:02,173] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default4]:[2022-10-06 14:27:02,167] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default4]:[2022-10-06 14:27:02,168] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default1]:[2022-10-06 14:27:02,077] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default1]:[2022-10-06 14:27:02,078] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_01-model_states.pt... +[default1]:[2022-10-06 14:27:02,116] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_01-model_states.pt. +[default1]:[2022-10-06 14:27:02,129] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default1]:[2022-10-06 14:27:02,170] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default1]:[2022-10-06 14:27:02,170] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_01-model_states.pt... +[default1]:[2022-10-06 14:27:02,088] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default3]:[2022-10-06 14:27:02,088] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default1]:[2022-10-06 14:27:02,088] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_01-model_states.pt... +[default1]:[2022-10-06 14:27:02,128] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_01-model_states.pt. +[default1]:[2022-10-06 14:27:02,134] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default3]:[2022-10-06 14:27:02,088] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_01-model_states.pt... +[default3]:[2022-10-06 14:27:02,127] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_01-model_states.pt. +[default3]:[2022-10-06 14:27:02,133] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default3]:[2022-10-06 14:27:02,157] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default3]:[2022-10-06 14:27:02,158] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_01-model_states.pt... +[default1]:[2022-10-06 14:27:02,159] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default1]:[2022-10-06 14:27:02,160] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_01-model_states.pt... +[default0]:[2022-10-06 14:27:02,079] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default3]:[2022-10-06 14:27:02,114] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_01-model_states.pt. +[default3]:[2022-10-06 14:27:02,125] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default3]:[2022-10-06 14:27:02,107] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default3]:[2022-10-06 14:27:02,107] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_01-model_states.pt... +[default3]:[2022-10-06 14:27:02,146] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_01-model_states.pt. +[default3]:[2022-10-06 14:27:02,153] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default5]:[2022-10-06 14:27:02,087] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default5]:[2022-10-06 14:27:02,124] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default5]:[2022-10-06 14:27:02,124] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_01-model_states.pt... +[default5]:[2022-10-06 14:27:02,171] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_01-model_states.pt. +[default4]:[2022-10-06 14:27:02,109] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:02,133] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default4]:[2022-10-06 14:27:02,179] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:02,180] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:02,088] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:02,093] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:02,125] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:02,125] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:02,161] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:02,165] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default1]:[2022-10-06 14:27:02,106] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default1]:[2022-10-06 14:27:02,107] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_01-model_states.pt... +[default1]:[2022-10-06 14:27:02,147] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_01-model_states.pt. +[default1]:[2022-10-06 14:27:02,151] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default1]:[2022-10-06 14:27:02,181] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default1]:[2022-10-06 14:27:02,181] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_01-model_states.pt... +[default4]:[2022-10-06 14:27:02,148] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default4]:[2022-10-06 14:27:02,148] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:02,094] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default0]:[2022-10-06 14:27:02,101] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:02,145] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default0]:[2022-10-06 14:27:02,145] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:02,182] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default1]:[2022-10-06 14:27:02,100] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_01-model_states.pt. +[default1]:[2022-10-06 14:27:02,104] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default1]:[2022-10-06 14:27:02,144] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default1]:[2022-10-06 14:27:02,144] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_01-model_states.pt... +[default5]:[2022-10-06 14:27:02,096] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default5]:[2022-10-06 14:27:02,097] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_01-model_states.pt... +[default5]:[2022-10-06 14:27:02,141] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_01-model_states.pt. +[default5]:[2022-10-06 14:27:02,146] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default5]:[2022-10-06 14:27:02,173] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default5]:[2022-10-06 14:27:02,174] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_01-model_states.pt... +[default0]:[2022-10-06 14:27:02,109] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default0]:[2022-10-06 14:27:02,117] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:02,147] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default0]:[2022-10-06 14:27:02,147] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:02,125] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:02,125] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:02,149] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:02,154] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default3]:[2022-10-06 14:27:02,108] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_01-model_states.pt. +[default3]:[2022-10-06 14:27:02,114] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default3]:[2022-10-06 14:27:02,147] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default3]:[2022-10-06 14:27:02,147] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_01-model_states.pt... +[default3]:[2022-10-06 14:27:02,174] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_01-model_states.pt. +[default3]:[2022-10-06 14:27:02,180] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default7]:[2022-10-06 14:27:02,087] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_01-model_states.pt. +[default7]:[2022-10-06 14:27:02,094] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default7]:[2022-10-06 14:27:02,125] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default7]:[2022-10-06 14:27:02,125] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_01-model_states.pt... +[default7]:[2022-10-06 14:27:02,169] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_01-model_states.pt. +[default7]:[2022-10-06 14:27:02,177] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:02,137] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default6]:[2022-10-06 14:27:02,138] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default7]:[2022-10-06 14:27:02,089] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default7]:[2022-10-06 14:27:02,131] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default7]:[2022-10-06 14:27:02,131] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_01-model_states.pt... +[default2]:[2022-10-06 14:27:02,093] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default2]:[2022-10-06 14:27:02,138] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:02,138] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default7]:[2022-10-06 14:27:02,177] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_01-model_states.pt. +[default7]:[2022-10-06 14:27:02,182] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default2]:[2022-10-06 14:27:02,183] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:02,087] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default4]:[2022-10-06 14:27:02,114] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:02,120] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default4]:[2022-10-06 14:27:02,154] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:02,155] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default4]:[2022-10-06 14:27:02,185] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:02,109] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:02,109] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default2]:[2022-10-06 14:27:02,134] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:02,141] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default2]:[2022-10-06 14:27:02,184] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:02,184] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default7]:[2022-10-06 14:27:02,096] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_01-model_states.pt. +[default7]:[2022-10-06 14:27:02,102] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default7]:[2022-10-06 14:27:02,131] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default7]:[2022-10-06 14:27:02,132] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_01-model_states.pt... +[default7]:[2022-10-06 14:27:02,162] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_01-model_states.pt. +[default7]:[2022-10-06 14:27:02,167] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default7]:[2022-10-06 14:27:02,182] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default7]:[2022-10-06 14:27:02,182] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_01-model_states.pt... +[default1]:[2022-10-06 14:27:02,096] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default1]:[2022-10-06 14:27:02,097] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_01-model_states.pt... +[default1]:[2022-10-06 14:27:02,124] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_01-model_states.pt. +[default1]:[2022-10-06 14:27:02,130] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default4]:[2022-10-06 14:27:02,102] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:02,102] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default4]:[2022-10-06 14:27:02,129] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:02,136] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default4]:[2022-10-06 14:27:02,119] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:02,120] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default4]:[2022-10-06 14:27:02,135] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:02,143] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default4]:[2022-10-06 14:27:02,165] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:02,165] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default4]:[2022-10-06 14:27:02,180] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:02,185] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default5]:[2022-10-06 14:27:02,110] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_01-model_states.pt. +[default5]:[2022-10-06 14:27:02,155] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default5]:[2022-10-06 14:27:02,174] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default5]:[2022-10-06 14:27:02,174] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_01-model_states.pt... +[default5]:[2022-10-06 14:27:02,190] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_01-model_states.pt. +[default5]:[2022-10-06 14:27:02,196] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default7]:[2022-10-06 14:27:02,121] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default7]:[2022-10-06 14:27:02,121] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_01-model_states.pt... +[default7]:[2022-10-06 14:27:02,141] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_01-model_states.pt. +[default7]:[2022-10-06 14:27:02,149] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default7]:[2022-10-06 14:27:02,167] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default7]:[2022-10-06 14:27:02,168] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_01-model_states.pt... +[default7]:[2022-10-06 14:27:02,185] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_01-model_states.pt. +[default7]:[2022-10-06 14:27:02,190] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default7]:[2022-10-06 14:27:02,208] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default7]:[2022-10-06 14:27:02,208] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_01-model_states.pt... +[default7]:[2022-10-06 14:27:02,103] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_01-model_states.pt. +[default7]:[2022-10-06 14:27:02,107] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default7]:[2022-10-06 14:27:02,126] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default7]:[2022-10-06 14:27:02,127] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_01-model_states.pt... +[default7]:[2022-10-06 14:27:02,162] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_01-model_states.pt. +[default7]:[2022-10-06 14:27:02,167] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default2]:[2022-10-06 14:27:02,079] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default2]:[2022-10-06 14:27:02,116] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:02,117] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default2]:[2022-10-06 14:27:02,149] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:02,158] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default4]:[2022-10-06 14:27:02,111] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:02,111] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default4]:[2022-10-06 14:27:02,134] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:02,142] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default3]:[2022-10-06 14:27:02,150] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default3]:[2022-10-06 14:27:02,151] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_01-model_states.pt... +[default0]:[2022-10-06 14:27:02,148] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default0]:[2022-10-06 14:27:02,148] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:02,168] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:02,088] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:02,096] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default2]:[2022-10-06 14:27:02,150] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:02,150] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:02,146] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:02,151] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default5]:[2022-10-06 14:27:02,197] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_01-model_states.pt. +[default5]:[2022-10-06 14:27:02,203] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default5]:[2022-10-06 14:27:02,240] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default5]:[2022-10-06 14:27:02,240] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_01-model_states.pt... +[default5]:[2022-10-06 14:27:02,266] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_01-model_states.pt. +[default0]:[2022-10-06 14:27:02,192] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default0]:[2022-10-06 14:27:02,198] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:02,247] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default0]:[2022-10-06 14:27:02,248] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default3]:[2022-10-06 14:27:02,203] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default3]:[2022-10-06 14:27:02,204] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_01-model_states.pt... +[default3]:[2022-10-06 14:27:02,253] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_01-model_states.pt. +[default3]:[2022-10-06 14:27:02,264] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default5]:[2022-10-06 14:27:02,184] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default5]:[2022-10-06 14:27:02,185] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_01-model_states.pt... +[default1]:[2022-10-06 14:27:02,196] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_01-model_states.pt. +[default1]:[2022-10-06 14:27:02,201] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default1]:[2022-10-06 14:27:02,236] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default1]:[2022-10-06 14:27:02,237] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_01-model_states.pt... +[default1]:[2022-10-06 14:27:02,263] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_01-model_states.pt. +[default1]:[2022-10-06 14:27:02,268] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:02,212] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:02,212] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:02,247] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:02,253] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default7]:[2022-10-06 14:27:02,189] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default7]:[2022-10-06 14:27:02,189] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_01-model_states.pt... +[default7]:[2022-10-06 14:27:02,229] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_01-model_states.pt. +[default7]:[2022-10-06 14:27:02,235] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default7]:[2022-10-06 14:27:02,258] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default7]:[2022-10-06 14:27:02,259] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_01-model_states.pt... +[default2]:[2022-10-06 14:27:02,202] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:02,203] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default2]:[2022-10-06 14:27:02,235] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:02,240] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default1]:[2022-10-06 14:27:02,202] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_01-model_states.pt. +[default1]:[2022-10-06 14:27:02,211] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default1]:[2022-10-06 14:27:02,211] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default1]:[2022-10-06 14:27:02,211] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_01-model_states.pt... +[default1]:[2022-10-06 14:27:02,213] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_01-model_states.pt. +[default1]:[2022-10-06 14:27:02,196] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_01-model_states.pt. +[default3]:[2022-10-06 14:27:02,196] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_01-model_states.pt. +[default1]:[2022-10-06 14:27:02,202] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default1]:[2022-10-06 14:27:02,234] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default1]:[2022-10-06 14:27:02,234] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_01-model_states.pt... +[default3]:[2022-10-06 14:27:02,202] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default3]:[2022-10-06 14:27:02,229] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default3]:[2022-10-06 14:27:02,230] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_01-model_states.pt... +[default3]:[2022-10-06 14:27:02,274] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_01-model_states.pt. +[default0]:[2022-10-06 14:27:02,178] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default3]:[2022-10-06 14:27:02,201] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_01-model_states.pt. +[default3]:[2022-10-06 14:27:02,210] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default3]:[2022-10-06 14:27:02,211] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default3]:[2022-10-06 14:27:02,211] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_01-model_states.pt... +[default0]:[2022-10-06 14:27:02,180] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default0]:[2022-10-06 14:27:02,180] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:02,181] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default3]:[2022-10-06 14:27:02,213] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_01-model_states.pt. +[default4]:[2022-10-06 14:27:02,180] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:02,180] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default4]:[2022-10-06 14:27:02,200] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:02,209] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default4]:[2022-10-06 14:27:02,259] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:02,259] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default2]:[2022-10-06 14:27:02,184] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:02,194] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default2]:[2022-10-06 14:27:02,194] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:02,194] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default2]:[2022-10-06 14:27:02,195] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default3]:[2022-10-06 14:27:02,182] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default3]:[2022-10-06 14:27:02,183] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_01-model_states.pt... +[default3]:[2022-10-06 14:27:02,215] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_01-model_states.pt. +[default3]:[2022-10-06 14:27:02,222] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default5]:[2022-10-06 14:27:02,181] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default5]:[2022-10-06 14:27:02,222] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default5]:[2022-10-06 14:27:02,222] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_01-model_states.pt... +[default5]:[2022-10-06 14:27:02,258] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_01-model_states.pt. +[default5]:[2022-10-06 14:27:02,264] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default4]:[2022-10-06 14:27:02,218] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:02,237] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default4]:[2022-10-06 14:27:02,269] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:02,269] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:02,200] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:02,200] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:02,236] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:02,240] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default1]:[2022-10-06 14:27:02,214] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_01-model_states.pt. +[default1]:[2022-10-06 14:27:02,218] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:02,192] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:02,229] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default0]:[2022-10-06 14:27:02,229] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:02,259] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default0]:[2022-10-06 14:27:02,264] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default1]:[2022-10-06 14:27:02,217] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_01_model_states.pt. +[default1]:[2022-10-06 14:27:02,219] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default1]:[2022-10-06 14:27:02,186] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_01-model_states.pt. +[default1]:[2022-10-06 14:27:02,193] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default1]:[2022-10-06 14:27:02,234] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default1]:[2022-10-06 14:27:02,234] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_01-model_states.pt... +[default1]:[2022-10-06 14:27:02,278] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_01-model_states.pt. +[default5]:[2022-10-06 14:27:02,266] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_01-model_states.pt. +[default5]:[2022-10-06 14:27:02,272] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:02,193] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default0]:[2022-10-06 14:27:02,200] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:02,258] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default0]:[2022-10-06 14:27:02,258] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:02,281] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:02,204] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:02,204] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:02,228] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:02,233] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default3]:[2022-10-06 14:27:02,211] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default3]:[2022-10-06 14:27:02,211] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_01-model_states.pt... +[default3]:[2022-10-06 14:27:02,270] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_01-model_states.pt. +[default6]:[2022-10-06 14:27:02,232] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:02,233] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default5]:[2022-10-06 14:27:02,194] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default5]:[2022-10-06 14:27:02,196] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_01-model_states.pt... +[default3]:[2022-10-06 14:27:02,275] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default7]:[2022-10-06 14:27:02,220] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default7]:[2022-10-06 14:27:02,220] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_01-model_states.pt... +[default7]:[2022-10-06 14:27:02,251] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_01-model_states.pt. +[default7]:[2022-10-06 14:27:02,258] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default7]:[2022-10-06 14:27:02,194] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default7]:[2022-10-06 14:27:02,196] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_01-model_states.pt... +[default3]:[2022-10-06 14:27:02,209] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default3]:[2022-10-06 14:27:02,210] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_01-model_states.pt... +[default2]:[2022-10-06 14:27:02,189] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default2]:[2022-10-06 14:27:02,224] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:02,225] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default2]:[2022-10-06 14:27:02,257] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:02,264] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default4]:[2022-10-06 14:27:02,192] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default4]:[2022-10-06 14:27:02,225] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:02,225] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default7]:[2022-10-06 14:27:02,223] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default7]:[2022-10-06 14:27:02,223] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_01-model_states.pt... +[default7]:[2022-10-06 14:27:02,260] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_01-model_states.pt. +[default4]:[2022-10-06 14:27:02,255] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:02,262] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default7]:[2022-10-06 14:27:02,264] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default2]:[2022-10-06 14:27:02,206] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:02,213] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default7]:[2022-10-06 14:27:02,259] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_01-model_states.pt. +[default7]:[2022-10-06 14:27:02,266] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default1]:[2022-10-06 14:27:02,246] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default1]:[2022-10-06 14:27:02,246] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_01-model_states.pt... +[default1]:[2022-10-06 14:27:02,268] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_01-model_states.pt. +[default1]:[2022-10-06 14:27:02,274] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default4]:[2022-10-06 14:27:02,246] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:02,247] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default4]:[2022-10-06 14:27:02,274] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:02,280] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default4]:[2022-10-06 14:27:02,209] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:02,210] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default4]:[2022-10-06 14:27:02,225] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:02,231] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default4]:[2022-10-06 14:27:02,256] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:02,256] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default4]:[2022-10-06 14:27:02,272] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:02,277] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default4]:[2022-10-06 14:27:02,298] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:02,298] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default5]:[2022-10-06 14:27:02,213] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default5]:[2022-10-06 14:27:02,214] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_01-model_states.pt... +[default5]:[2022-10-06 14:27:02,229] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_01-model_states.pt. +[default5]:[2022-10-06 14:27:02,236] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default5]:[2022-10-06 14:27:02,253] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default5]:[2022-10-06 14:27:02,253] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_01-model_states.pt... +[default5]:[2022-10-06 14:27:02,268] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_01-model_states.pt. +[default5]:[2022-10-06 14:27:02,275] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default5]:[2022-10-06 14:27:02,291] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default5]:[2022-10-06 14:27:02,291] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_01-model_states.pt... +[default7]:[2022-10-06 14:27:02,232] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_01-model_states.pt. +[default7]:[2022-10-06 14:27:02,237] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default7]:[2022-10-06 14:27:02,253] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default7]:[2022-10-06 14:27:02,253] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_01-model_states.pt... +[default7]:[2022-10-06 14:27:02,277] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_01-model_states.pt. +[default7]:[2022-10-06 14:27:02,282] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default7]:[2022-10-06 14:27:02,295] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default7]:[2022-10-06 14:27:02,295] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_01-model_states.pt... +[default6]:[2022-10-06 14:27:02,249] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:02,250] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:02,275] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:02,282] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:02,311] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:02,311] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default5]:[2022-10-06 14:27:02,272] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default5]:[2022-10-06 14:27:02,307] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default5]:[2022-10-06 14:27:02,307] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_01-model_states.pt... +[default5]:[2022-10-06 14:27:02,329] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_01-model_states.pt. +[default4]:[2022-10-06 14:27:02,295] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:02,296] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:02,352] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default6]:[2022-10-06 14:27:02,353] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:02,302] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_02_model_states.pt. +[default6]:[2022-10-06 14:27:02,302] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:02,330] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:02,331] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:02,356] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:02,366] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default3]:[2022-10-06 14:27:02,279] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default3]:[2022-10-06 14:27:02,299] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default3]:[2022-10-06 14:27:02,300] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_01-model_states.pt... +[default3]:[2022-10-06 14:27:02,333] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_01-model_states.pt. +[default3]:[2022-10-06 14:27:02,338] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default3]:[2022-10-06 14:27:02,357] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default3]:[2022-10-06 14:27:02,358] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_01-model_states.pt... +[default1]:[2022-10-06 14:27:02,279] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_01-model_states.pt. +[default1]:[2022-10-06 14:27:02,286] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default1]:[2022-10-06 14:27:02,318] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default1]:[2022-10-06 14:27:02,318] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_01-model_states.pt... +[default1]:[2022-10-06 14:27:02,352] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_01-model_states.pt. +[default1]:[2022-10-06 14:27:02,359] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default3]:[2022-10-06 14:27:02,303] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default3]:[2022-10-06 14:27:02,303] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_01-model_states.pt... +[default3]:[2022-10-06 14:27:02,342] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_01-model_states.pt. +[default3]:[2022-10-06 14:27:02,352] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default5]:[2022-10-06 14:27:02,290] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default5]:[2022-10-06 14:27:02,290] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_01-model_states.pt... +[default5]:[2022-10-06 14:27:02,331] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_01-model_states.pt. +[default5]:[2022-10-06 14:27:02,342] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default5]:[2022-10-06 14:27:02,378] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default5]:[2022-10-06 14:27:02,379] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_01-model_states.pt... +[default4]:[2022-10-06 14:27:02,294] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:02,317] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default4]:[2022-10-06 14:27:02,362] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:02,362] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:02,310] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:02,310] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:02,344] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:02,348] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default1]:[2022-10-06 14:27:02,303] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default1]:[2022-10-06 14:27:02,303] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_01-model_states.pt... +[default1]:[2022-10-06 14:27:02,341] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_01-model_states.pt. +[default1]:[2022-10-06 14:27:02,345] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default1]:[2022-10-06 14:27:02,381] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:02,283] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:02,284] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:02,310] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default0]:[2022-10-06 14:27:02,310] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:02,344] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default0]:[2022-10-06 14:27:02,350] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default1]:[2022-10-06 14:27:02,285] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default1]:[2022-10-06 14:27:02,319] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default1]:[2022-10-06 14:27:02,320] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_01-model_states.pt... +[default1]:[2022-10-06 14:27:02,363] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_01-model_states.pt. +[default1]:[2022-10-06 14:27:02,370] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default5]:[2022-10-06 14:27:02,293] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default5]:[2022-10-06 14:27:02,294] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_01-model_states.pt... +[default5]:[2022-10-06 14:27:02,344] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_01-model_states.pt. +[default5]:[2022-10-06 14:27:02,352] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default5]:[2022-10-06 14:27:02,379] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default5]:[2022-10-06 14:27:02,380] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_01-model_states.pt... +[default0]:[2022-10-06 14:27:02,293] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:02,323] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default0]:[2022-10-06 14:27:02,324] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:02,332] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default5]:[2022-10-06 14:27:02,337] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_01-model_states.pt. +[default7]:[2022-10-06 14:27:02,337] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_01-model_states.pt. +[default3]:[2022-10-06 14:27:02,360] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_01-model_states.pt. +[default7]:[2022-10-06 14:27:02,290] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default7]:[2022-10-06 14:27:02,290] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_01-model_states.pt... +[default7]:[2022-10-06 14:27:02,331] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_01-model_states.pt. +[default7]:[2022-10-06 14:27:02,336] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default7]:[2022-10-06 14:27:02,373] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default7]:[2022-10-06 14:27:02,374] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_01-model_states.pt... +[default2]:[2022-10-06 14:27:02,306] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:02,306] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default2]:[2022-10-06 14:27:02,346] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:02,356] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default7]:[2022-10-06 14:27:02,301] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default7]:[2022-10-06 14:27:02,301] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_01-model_states.pt... +[default7]:[2022-10-06 14:27:02,334] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_01-model_states.pt. +[default7]:[2022-10-06 14:27:02,341] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default7]:[2022-10-06 14:27:02,358] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default7]:[2022-10-06 14:27:02,358] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_01-model_states.pt... +[default7]:[2022-10-06 14:27:02,376] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_01-model_states.pt. +[default7]:[2022-10-06 14:27:02,381] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default1]:[2022-10-06 14:27:02,293] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default1]:[2022-10-06 14:27:02,294] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_01-model_states.pt... +[default1]:[2022-10-06 14:27:02,315] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_01-model_states.pt. +[default1]:[2022-10-06 14:27:02,321] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default1]:[2022-10-06 14:27:02,322] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default1]:[2022-10-06 14:27:02,322] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_01-model_states.pt... +[default1]:[2022-10-06 14:27:02,322] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_01-model_states.pt. +[default4]:[2022-10-06 14:27:02,304] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:02,304] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default4]:[2022-10-06 14:27:02,334] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:02,341] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default4]:[2022-10-06 14:27:02,341] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:02,341] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default4]:[2022-10-06 14:27:02,342] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:02,313] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:02,318] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default4]:[2022-10-06 14:27:02,339] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:02,339] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default4]:[2022-10-06 14:27:02,354] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:02,358] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default4]:[2022-10-06 14:27:02,374] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:02,374] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default4]:[2022-10-06 14:27:02,389] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:02,394] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default5]:[2022-10-06 14:27:02,306] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_01-model_states.pt. +[default5]:[2022-10-06 14:27:02,311] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default5]:[2022-10-06 14:27:02,327] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default5]:[2022-10-06 14:27:02,327] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_01-model_states.pt... +[default5]:[2022-10-06 14:27:02,343] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_01-model_states.pt. +[default5]:[2022-10-06 14:27:02,348] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default5]:[2022-10-06 14:27:02,364] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default5]:[2022-10-06 14:27:02,364] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_01-model_states.pt... +[default5]:[2022-10-06 14:27:02,380] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_01-model_states.pt. +[default5]:[2022-10-06 14:27:02,385] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default5]:[2022-10-06 14:27:02,400] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default5]:[2022-10-06 14:27:02,401] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_01-model_states.pt... +[default7]:[2022-10-06 14:27:02,317] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_01-model_states.pt. +[default7]:[2022-10-06 14:27:02,322] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default7]:[2022-10-06 14:27:02,339] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default7]:[2022-10-06 14:27:02,339] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_01-model_states.pt... +[default7]:[2022-10-06 14:27:02,364] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_01-model_states.pt. +[default7]:[2022-10-06 14:27:02,369] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default7]:[2022-10-06 14:27:02,383] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default7]:[2022-10-06 14:27:02,384] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_01-model_states.pt... +[default7]:[2022-10-06 14:27:02,408] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_01-model_states.pt. +[default5]:[2022-10-06 14:27:02,332] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_01-model_states.pt. +[default5]:[2022-10-06 14:27:02,339] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:02,284] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default0]:[2022-10-06 14:27:02,290] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:02,333] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default0]:[2022-10-06 14:27:02,333] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default3]:[2022-10-06 14:27:02,305] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default3]:[2022-10-06 14:27:02,306] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_01-model_states.pt... +[default3]:[2022-10-06 14:27:02,356] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_01-model_states.pt. +[default3]:[2022-10-06 14:27:02,365] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default1]:[2022-10-06 14:27:02,304] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default1]:[2022-10-06 14:27:02,305] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_01-model_states.pt... +[default1]:[2022-10-06 14:27:02,334] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_01-model_states.pt. +[default1]:[2022-10-06 14:27:02,339] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:02,292] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:02,292] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:02,328] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:02,333] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:02,370] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:02,371] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default7]:[2022-10-06 14:27:02,298] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_01-model_states.pt. +[default7]:[2022-10-06 14:27:02,303] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default7]:[2022-10-06 14:27:02,327] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default7]:[2022-10-06 14:27:02,327] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_01-model_states.pt... +[default7]:[2022-10-06 14:27:02,362] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_01-model_states.pt. +[default7]:[2022-10-06 14:27:02,367] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default2]:[2022-10-06 14:27:02,275] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:02,275] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default2]:[2022-10-06 14:27:02,306] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:02,312] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default2]:[2022-10-06 14:27:02,345] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:02,345] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default4]:[2022-10-06 14:27:02,279] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:02,287] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default4]:[2022-10-06 14:27:02,327] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:02,327] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default4]:[2022-10-06 14:27:02,347] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:02,356] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:02,339] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:02,346] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:02,346] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:02,346] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:02,346] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default0]:[2022-10-06 14:27:02,358] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default0]:[2022-10-06 14:27:02,368] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:02,323] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:02,323] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:02,348] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:02,354] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default3]:[2022-10-06 14:27:02,303] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default3]:[2022-10-06 14:27:02,303] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_01-model_states.pt... +[default3]:[2022-10-06 14:27:02,344] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_01-model_states.pt. +[default3]:[2022-10-06 14:27:02,353] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default7]:[2022-10-06 14:27:02,296] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default7]:[2022-10-06 14:27:02,296] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_01-model_states.pt... +[default7]:[2022-10-06 14:27:02,329] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_01-model_states.pt. +[default7]:[2022-10-06 14:27:02,336] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default7]:[2022-10-06 14:27:02,380] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default7]:[2022-10-06 14:27:02,380] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_01-model_states.pt... +[default2]:[2022-10-06 14:27:02,302] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:02,302] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default2]:[2022-10-06 14:27:02,340] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:02,345] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default4]:[2022-10-06 14:27:02,326] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:02,326] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default4]:[2022-10-06 14:27:02,358] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:02,368] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default5]:[2022-10-06 14:27:02,371] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default5]:[2022-10-06 14:27:02,372] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_01-model_states.pt... +[default5]:[2022-10-06 14:27:02,397] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_01-model_states.pt. +[default5]:[2022-10-06 14:27:02,403] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default5]:[2022-10-06 14:27:02,433] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default5]:[2022-10-06 14:27:02,433] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_01-model_states.pt... +[default5]:[2022-10-06 14:27:02,461] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_01-model_states.pt. +[default5]:[2022-10-06 14:27:02,468] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:02,375] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default0]:[2022-10-06 14:27:02,383] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:02,417] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default0]:[2022-10-06 14:27:02,418] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:02,450] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default0]:[2022-10-06 14:27:02,456] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default3]:[2022-10-06 14:27:02,411] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default3]:[2022-10-06 14:27:02,411] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_01-model_states.pt... +[default3]:[2022-10-06 14:27:02,458] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_01-model_states.pt. +[default3]:[2022-10-06 14:27:02,469] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default1]:[2022-10-06 14:27:02,379] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default1]:[2022-10-06 14:27:02,379] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_01-model_states.pt... +[default1]:[2022-10-06 14:27:02,417] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_01-model_states.pt. +[default1]:[2022-10-06 14:27:02,421] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default1]:[2022-10-06 14:27:02,466] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default1]:[2022-10-06 14:27:02,466] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_01-model_states.pt... +[default5]:[2022-10-06 14:27:02,431] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default5]:[2022-10-06 14:27:02,466] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default5]:[2022-10-06 14:27:02,466] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_01-model_states.pt... +[default6]:[2022-10-06 14:27:02,400] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:02,406] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:02,438] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:02,439] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:02,468] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default7]:[2022-10-06 14:27:02,388] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default7]:[2022-10-06 14:27:02,389] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_01-model_states.pt... +[default7]:[2022-10-06 14:27:02,425] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_01-model_states.pt. +[default7]:[2022-10-06 14:27:02,431] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default7]:[2022-10-06 14:27:02,460] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default7]:[2022-10-06 14:27:02,461] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_01-model_states.pt... +[default4]:[2022-10-06 14:27:02,437] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:02,378] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:02,383] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default2]:[2022-10-06 14:27:02,426] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:02,427] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default2]:[2022-10-06 14:27:02,469] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default5]:[2022-10-06 14:27:02,423] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_03_model_states.pt. +[default5]:[2022-10-06 14:27:02,424] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default5]:[2022-10-06 14:27:02,451] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default5]:[2022-10-06 14:27:02,451] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_01-model_states.pt... +[default5]:[2022-10-06 14:27:02,468] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_01-model_states.pt. +[default5]:[2022-10-06 14:27:02,473] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default4]:[2022-10-06 14:27:02,460] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_02_model_states.pt. +[default4]:[2022-10-06 14:27:02,461] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default7]:[2022-10-06 14:27:02,466] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_03_model_states.pt. +[default7]:[2022-10-06 14:27:02,467] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:02,396] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:02,396] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:02,413] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:02,419] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:02,444] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:02,444] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:02,460] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:02,465] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default3]:[2022-10-06 14:27:02,415] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_01-model_states.pt. +[default3]:[2022-10-06 14:27:02,421] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default1]:[2022-10-06 14:27:02,387] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default3]:[2022-10-06 14:27:02,452] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default3]:[2022-10-06 14:27:02,453] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_01-model_states.pt... +[default1]:[2022-10-06 14:27:02,387] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_01-model_states.pt... +[default1]:[2022-10-06 14:27:02,416] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_01-model_states.pt. +[default1]:[2022-10-06 14:27:02,422] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default1]:[2022-10-06 14:27:02,457] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default1]:[2022-10-06 14:27:02,457] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_01-model_states.pt... +[default4]:[2022-10-06 14:27:02,387] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:02,388] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default4]:[2022-10-06 14:27:02,407] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:02,416] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default4]:[2022-10-06 14:27:02,452] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:02,452] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default4]:[2022-10-06 14:27:02,472] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default3]:[2022-10-06 14:27:02,381] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default3]:[2022-10-06 14:27:02,382] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_01-model_states.pt... +[default3]:[2022-10-06 14:27:02,428] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_01-model_states.pt. +[default3]:[2022-10-06 14:27:02,439] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default3]:[2022-10-06 14:27:02,439] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default3]:[2022-10-06 14:27:02,439] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_01-model_states.pt... +[default3]:[2022-10-06 14:27:02,440] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_01-model_states.pt. +[default5]:[2022-10-06 14:27:02,429] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_01-model_states.pt. +[default5]:[2022-10-06 14:27:02,440] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default5]:[2022-10-06 14:27:02,463] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default5]:[2022-10-06 14:27:02,463] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_01-model_states.pt... +[default4]:[2022-10-06 14:27:02,412] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:02,430] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default4]:[2022-10-06 14:27:02,458] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:02,459] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:02,396] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:02,397] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:02,437] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:02,441] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:02,442] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:02,442] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:02,442] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default1]:[2022-10-06 14:27:02,382] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_01-model_states.pt... +[default1]:[2022-10-06 14:27:02,427] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_01-model_states.pt. +[default1]:[2022-10-06 14:27:02,434] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default1]:[2022-10-06 14:27:02,436] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default1]:[2022-10-06 14:27:02,437] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_01-model_states.pt... +[default1]:[2022-10-06 14:27:02,438] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_01-model_states.pt. +[default4]:[2022-10-06 14:27:02,397] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default0]:[2022-10-06 14:27:02,386] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default0]:[2022-10-06 14:27:02,386] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:02,428] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default0]:[2022-10-06 14:27:02,438] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:02,439] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default0]:[2022-10-06 14:27:02,439] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:02,439] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default1]:[2022-10-06 14:27:02,401] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default1]:[2022-10-06 14:27:02,402] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_01-model_states.pt... +[default1]:[2022-10-06 14:27:02,417] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default1]:[2022-10-06 14:27:02,417] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_01-model_states.pt... +[default1]:[2022-10-06 14:27:02,448] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_01-model_states.pt. +[default1]:[2022-10-06 14:27:02,453] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default1]:[2022-10-06 14:27:02,482] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default1]:[2022-10-06 14:27:02,483] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_01-model_states.pt... +[default5]:[2022-10-06 14:27:02,417] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_01-model_states.pt. +[default5]:[2022-10-06 14:27:02,423] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default5]:[2022-10-06 14:27:02,452] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default5]:[2022-10-06 14:27:02,453] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_01-model_states.pt... +[default5]:[2022-10-06 14:27:02,396] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_03_model_states.pt. +[default5]:[2022-10-06 14:27:02,396] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default5]:[2022-10-06 14:27:02,419] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default5]:[2022-10-06 14:27:02,419] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_01-model_states.pt... +[default5]:[2022-10-06 14:27:02,435] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_01-model_states.pt. +[default5]:[2022-10-06 14:27:02,439] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default5]:[2022-10-06 14:27:02,462] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default5]:[2022-10-06 14:27:02,462] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_01-model_states.pt... +[default5]:[2022-10-06 14:27:02,483] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_01-model_states.pt. +[default0]:[2022-10-06 14:27:02,406] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default0]:[2022-10-06 14:27:02,406] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:02,467] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default0]:[2022-10-06 14:27:02,476] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:02,397] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:02,397] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:02,425] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:02,430] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:02,432] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:02,432] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:02,433] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default3]:[2022-10-06 14:27:02,388] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default3]:[2022-10-06 14:27:02,388] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_01-model_states.pt... +[default3]:[2022-10-06 14:27:02,421] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_01-model_states.pt. +[default3]:[2022-10-06 14:27:02,427] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default3]:[2022-10-06 14:27:02,457] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default3]:[2022-10-06 14:27:02,457] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_01-model_states.pt... +[default7]:[2022-10-06 14:27:02,426] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_01-model_states.pt. +[default7]:[2022-10-06 14:27:02,438] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default7]:[2022-10-06 14:27:02,471] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default7]:[2022-10-06 14:27:02,471] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_01-model_states.pt... +[default6]:[2022-10-06 14:27:02,424] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default5]:[2022-10-06 14:27:02,478] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:02,462] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:02,462] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:02,477] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default3]:[2022-10-06 14:27:02,422] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default2]:[2022-10-06 14:27:02,394] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:02,394] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default2]:[2022-10-06 14:27:02,435] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:02,444] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default2]:[2022-10-06 14:27:02,483] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:02,483] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default4]:[2022-10-06 14:27:02,402] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:02,402] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default4]:[2022-10-06 14:27:02,432] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:02,441] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default4]:[2022-10-06 14:27:02,442] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:02,442] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default4]:[2022-10-06 14:27:02,442] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default7]:[2022-10-06 14:27:02,428] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_01-model_states.pt. +[default7]:[2022-10-06 14:27:02,433] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default7]:[2022-10-06 14:27:02,456] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default7]:[2022-10-06 14:27:02,457] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_01-model_states.pt... +[default7]:[2022-10-06 14:27:02,474] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_01-model_states.pt. +[default7]:[2022-10-06 14:27:02,482] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:02,397] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default3]:[2022-10-06 14:27:02,469] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default3]:[2022-10-06 14:27:02,470] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_01-model_states.pt... +[default2]:[2022-10-06 14:27:02,388] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:02,388] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default2]:[2022-10-06 14:27:02,428] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:02,439] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default2]:[2022-10-06 14:27:02,439] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:02,439] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default2]:[2022-10-06 14:27:02,440] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default0]:[2022-10-06 14:27:02,398] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default7]:[2022-10-06 14:27:02,471] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default7]:[2022-10-06 14:27:02,398] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default7]:[2022-10-06 14:27:02,399] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_01-model_states.pt... +[default7]:[2022-10-06 14:27:02,424] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_01-model_states.pt. +[default7]:[2022-10-06 14:27:02,430] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default7]:[2022-10-06 14:27:02,457] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default7]:[2022-10-06 14:27:02,457] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_01-model_states.pt... +[default7]:[2022-10-06 14:27:02,475] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_01-model_states.pt. +[default7]:[2022-10-06 14:27:02,480] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default4]:[2022-10-06 14:27:02,412] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:02,413] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default4]:[2022-10-06 14:27:02,428] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:02,434] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default4]:[2022-10-06 14:27:02,450] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:02,450] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default4]:[2022-10-06 14:27:02,465] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:02,472] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default4]:[2022-10-06 14:27:02,487] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:02,487] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default4]:[2022-10-06 14:27:02,503] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default5]:[2022-10-06 14:27:02,419] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_01-model_states.pt. +[default5]:[2022-10-06 14:27:02,424] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default5]:[2022-10-06 14:27:02,442] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default5]:[2022-10-06 14:27:02,442] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_01-model_states.pt... +[default5]:[2022-10-06 14:27:02,461] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_01-model_states.pt. +[default5]:[2022-10-06 14:27:02,466] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default5]:[2022-10-06 14:27:02,486] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default5]:[2022-10-06 14:27:02,486] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_01-model_states.pt... +[default5]:[2022-10-06 14:27:02,503] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_01-model_states.pt. +[default7]:[2022-10-06 14:27:02,414] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default7]:[2022-10-06 14:27:02,430] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default7]:[2022-10-06 14:27:02,430] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_01-model_states.pt... +[default7]:[2022-10-06 14:27:02,449] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_01-model_states.pt. +[default7]:[2022-10-06 14:27:02,455] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default7]:[2022-10-06 14:27:02,478] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default7]:[2022-10-06 14:27:02,479] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_01-model_states.pt... +[default7]:[2022-10-06 14:27:02,497] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_01-model_states.pt. +[default7]:[2022-10-06 14:27:02,502] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default5]:[2022-10-06 14:27:02,497] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default5]:[2022-10-06 14:27:02,497] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_01-model_states.pt... +[default5]:[2022-10-06 14:27:02,517] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_01-model_states.pt. +[default0]:[2022-10-06 14:27:02,491] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default0]:[2022-10-06 14:27:02,491] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:02,516] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default5]:[2022-10-06 14:27:02,496] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_01-model_states.pt. +[default5]:[2022-10-06 14:27:02,511] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default5]:[2022-10-06 14:27:02,539] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default5]:[2022-10-06 14:27:02,540] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_01-model_states.pt... +[default5]:[2022-10-06 14:27:02,560] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_01-model_states.pt. +[default5]:[2022-10-06 14:27:02,569] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default4]:[2022-10-06 14:27:02,530] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default4]:[2022-10-06 14:27:02,561] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:02,561] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default5]:[2022-10-06 14:27:02,496] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default5]:[2022-10-06 14:27:02,496] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_01-model_states.pt... +[default5]:[2022-10-06 14:27:02,527] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_01-model_states.pt. +[default5]:[2022-10-06 14:27:02,532] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default5]:[2022-10-06 14:27:02,556] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default5]:[2022-10-06 14:27:02,556] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_01-model_states.pt... +[default4]:[2022-10-06 14:27:02,480] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:02,481] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default4]:[2022-10-06 14:27:02,497] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:02,502] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default4]:[2022-10-06 14:27:02,522] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:02,522] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default4]:[2022-10-06 14:27:02,539] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:02,547] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default4]:[2022-10-06 14:27:02,574] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:02,574] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:02,528] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:02,529] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:02,492] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_00_optim_states.pt'] +[default0]:[2022-10-06 14:27:02,492] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt'] +[default7]:[2022-10-06 14:27:02,536] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_01_model_states.pt. +[default7]:[2022-10-06 14:27:02,537] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default3]:[2022-10-06 14:27:02,488] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_01-model_states.pt. +[default3]:[2022-10-06 14:27:02,549] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_01_optim_states.pt'] +[default3]:[2022-10-06 14:27:02,549] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt'] +[default1]:[2022-10-06 14:27:02,490] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_01-model_states.pt. +[default1]:[2022-10-06 14:27:02,548] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_01_optim_states.pt'] +[default1]:[2022-10-06 14:27:02,548] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt'] +[default2]:[2022-10-06 14:27:02,521] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_00_optim_states.pt'] +[default2]:[2022-10-06 14:27:02,521] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt'] +[default3]:[2022-10-06 14:27:02,549] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_03_optim_states.pt'] +[default3]:[2022-10-06 14:27:02,549] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_03_optim_states.pt'] +[default5]:[2022-10-06 14:27:02,484] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_01-model_states.pt. +[default5]:[2022-10-06 14:27:02,493] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default5]:[2022-10-06 14:27:02,515] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default5]:[2022-10-06 14:27:02,515] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_01-model_states.pt... +[default5]:[2022-10-06 14:27:02,536] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_01-model_states.pt. +[default5]:[2022-10-06 14:27:02,542] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default5]:[2022-10-06 14:27:02,559] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default5]:[2022-10-06 14:27:02,560] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_01-model_states.pt... +[default5]:[2022-10-06 14:27:02,577] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_01-model_states.pt. +[default4]:[2022-10-06 14:27:02,483] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:02,493] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default4]:[2022-10-06 14:27:02,520] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:02,520] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default4]:[2022-10-06 14:27:02,542] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:02,548] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default4]:[2022-10-06 14:27:02,549] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:02,549] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default4]:[2022-10-06 14:27:02,549] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:02,495] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default4]:[2022-10-06 14:27:02,524] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:02,524] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default4]:[2022-10-06 14:27:02,546] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:02,554] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default4]:[2022-10-06 14:27:02,579] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:02,579] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default1]:[2022-10-06 14:27:02,548] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_03_optim_states.pt'] +[default1]:[2022-10-06 14:27:02,548] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_03_optim_states.pt'] +[default0]:[2022-10-06 14:27:02,493] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_02_optim_states.pt'] +[default0]:[2022-10-06 14:27:02,493] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_02_optim_states.pt'] +[default1]:[2022-10-06 14:27:02,552] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_01-model_states.pt. +[default1]:[2022-10-06 14:27:02,507] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_01-model_states.pt. +[default5]:[2022-10-06 14:27:02,485] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_01-model_states.pt. +[default1]:[2022-10-06 14:27:02,515] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default1]:[2022-10-06 14:27:02,515] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default1]:[2022-10-06 14:27:02,515] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_01-model_states.pt... +[default1]:[2022-10-06 14:27:02,515] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_01-model_states.pt. +[default5]:[2022-10-06 14:27:02,487] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default5]:[2022-10-06 14:27:02,510] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default5]:[2022-10-06 14:27:02,510] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_01-model_states.pt... +[default5]:[2022-10-06 14:27:02,525] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_01-model_states.pt. +[default5]:[2022-10-06 14:27:02,529] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default5]:[2022-10-06 14:27:02,544] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default5]:[2022-10-06 14:27:02,544] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_01-model_states.pt... +[default5]:[2022-10-06 14:27:02,566] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_01-model_states.pt. +[default5]:[2022-10-06 14:27:02,571] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default2]:[2022-10-06 14:27:02,548] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default2]:[2022-10-06 14:27:02,548] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default5]:[2022-10-06 14:27:02,499] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default5]:[2022-10-06 14:27:02,500] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default1]:[2022-10-06 14:27:02,573] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_03_optim_states.pt'] +[default1]:[2022-10-06 14:27:02,574] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_03_optim_states.pt'] +[default5]:[2022-10-06 14:27:02,500] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_01-model_states.pt... +[default5]:[2022-10-06 14:27:02,502] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_01-model_states.pt. +[default0]:[2022-10-06 14:27:02,516] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default0]:[2022-10-06 14:27:02,516] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:02,555] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default0]:[2022-10-06 14:27:02,563] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:02,564] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default0]:[2022-10-06 14:27:02,564] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:02,564] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:02,517] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_02_optim_states.pt'] +[default6]:[2022-10-06 14:27:02,518] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_02_optim_states.pt'] +[default3]:[2022-10-06 14:27:02,488] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_01-model_states.pt. +[default3]:[2022-10-06 14:27:02,504] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default3]:[2022-10-06 14:27:02,504] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default3]:[2022-10-06 14:27:02,504] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_01-model_states.pt... +[default3]:[2022-10-06 14:27:02,505] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_01-model_states.pt. +[default3]:[2022-10-06 14:27:02,558] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_03_optim_states.pt'] +[default3]:[2022-10-06 14:27:02,559] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_03_optim_states.pt'] +[default7]:[2022-10-06 14:27:02,490] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_01-model_states.pt. +[default7]:[2022-10-06 14:27:02,502] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default7]:[2022-10-06 14:27:02,524] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default7]:[2022-10-06 14:27:02,525] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_01-model_states.pt... +[default7]:[2022-10-06 14:27:02,547] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_01-model_states.pt. +[default7]:[2022-10-06 14:27:02,556] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default7]:[2022-10-06 14:27:02,556] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default7]:[2022-10-06 14:27:02,556] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_01-model_states.pt... +[default7]:[2022-10-06 14:27:02,556] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_01-model_states.pt. +[default6]:[2022-10-06 14:27:02,487] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:02,523] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:02,524] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:02,540] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:02,547] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:02,583] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default5]:[2022-10-06 14:27:02,501] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default5]:[2022-10-06 14:27:02,501] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_01-model_states.pt... +[default6]:[2022-10-06 14:27:02,584] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default5]:[2022-10-06 14:27:02,524] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_01-model_states.pt. +[default5]:[2022-10-06 14:27:02,532] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default5]:[2022-10-06 14:27:02,556] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default5]:[2022-10-06 14:27:02,556] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_01-model_states.pt... +[default5]:[2022-10-06 14:27:02,584] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_01-model_states.pt. +[default2]:[2022-10-06 14:27:02,512] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:02,520] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default2]:[2022-10-06 14:27:02,542] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default7]:[2022-10-06 14:27:02,504] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default7]:[2022-10-06 14:27:02,504] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_01-model_states.pt... +[default7]:[2022-10-06 14:27:02,523] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_01-model_states.pt. +[default3]:[2022-10-06 14:27:02,524] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_01-model_states.pt. +[default3]:[2022-10-06 14:27:02,533] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default3]:[2022-10-06 14:27:02,570] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default3]:[2022-10-06 14:27:02,570] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_01-model_states.pt... +[default0]:[2022-10-06 14:27:02,524] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default0]:[2022-10-06 14:27:02,525] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default7]:[2022-10-06 14:27:02,493] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default7]:[2022-10-06 14:27:02,493] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_01-model_states.pt... +[default7]:[2022-10-06 14:27:02,520] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_01-model_states.pt. +[default7]:[2022-10-06 14:27:02,527] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default7]:[2022-10-06 14:27:02,550] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default7]:[2022-10-06 14:27:02,551] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_01-model_states.pt... +[default7]:[2022-10-06 14:27:02,585] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_01-model_states.pt. +[default2]:[2022-10-06 14:27:02,542] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default2]:[2022-10-06 14:27:02,565] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:02,570] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default7]:[2022-10-06 14:27:02,528] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default7]:[2022-10-06 14:27:02,547] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default7]:[2022-10-06 14:27:02,547] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_01-model_states.pt... +[default7]:[2022-10-06 14:27:02,570] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_01-model_states.pt. +[default7]:[2022-10-06 14:27:02,574] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default2]:[2022-10-06 14:27:02,521] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_02_optim_states.pt'] +[default2]:[2022-10-06 14:27:02,521] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_02_optim_states.pt'] +[default7]:[2022-10-06 14:27:02,500] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default7]:[2022-10-06 14:27:02,500] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_01-model_states.pt... +[default7]:[2022-10-06 14:27:02,517] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_01-model_states.pt. +[default7]:[2022-10-06 14:27:02,522] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default7]:[2022-10-06 14:27:02,540] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default7]:[2022-10-06 14:27:02,540] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_01-model_states.pt... +[default7]:[2022-10-06 14:27:02,559] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_01-model_states.pt. +[default7]:[2022-10-06 14:27:02,563] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default7]:[2022-10-06 14:27:02,583] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default7]:[2022-10-06 14:27:02,583] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_01-model_states.pt... +[default4]:[2022-10-06 14:27:02,511] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default4]:[2022-10-06 14:27:02,531] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:02,531] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default4]:[2022-10-06 14:27:02,552] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default5]:[2022-10-06 14:27:02,510] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:02,517] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_00_optim_states.pt'] +[default6]:[2022-10-06 14:27:02,517] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt'] +[default5]:[2022-10-06 14:27:02,529] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default5]:[2022-10-06 14:27:02,530] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_01-model_states.pt... +[default5]:[2022-10-06 14:27:02,551] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_01-model_states.pt. +[default5]:[2022-10-06 14:27:02,558] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default5]:[2022-10-06 14:27:02,578] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default5]:[2022-10-06 14:27:02,578] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_01-model_states.pt... +[default5]:[2022-10-06 14:27:02,600] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_01-model_states.pt. +[default3]:[2022-10-06 14:27:02,558] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_01_optim_states.pt'] +[default3]:[2022-10-06 14:27:02,558] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt'] +[default1]:[2022-10-06 14:27:02,572] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_01_optim_states.pt'] +[default1]:[2022-10-06 14:27:02,573] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt'] +[default7]:[2022-10-06 14:27:02,524] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default7]:[2022-10-06 14:27:02,524] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_01-model_states.pt... +[default7]:[2022-10-06 14:27:02,543] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_01-model_states.pt. +[default7]:[2022-10-06 14:27:02,548] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default7]:[2022-10-06 14:27:02,566] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default7]:[2022-10-06 14:27:02,566] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_01-model_states.pt... +[default7]:[2022-10-06 14:27:02,583] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_01-model_states.pt. +[default7]:[2022-10-06 14:27:02,588] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default7]:[2022-10-06 14:27:02,605] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default7]:[2022-10-06 14:27:02,606] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_01-model_states.pt... +[default0]:[2022-10-06 14:27:02,523] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:02,544] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default0]:[2022-10-06 14:27:02,544] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:02,562] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default0]:[2022-10-06 14:27:02,567] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default3]:[2022-10-06 14:27:02,499] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default3]:[2022-10-06 14:27:02,499] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_01-model_states.pt... +[default3]:[2022-10-06 14:27:02,525] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_01-model_states.pt. +[default3]:[2022-10-06 14:27:02,530] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default3]:[2022-10-06 14:27:02,548] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default3]:[2022-10-06 14:27:02,548] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_01-model_states.pt... +[default3]:[2022-10-06 14:27:02,568] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_01-model_states.pt. +[default1]:[2022-10-06 14:27:02,500] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_01-model_states.pt. +[default1]:[2022-10-06 14:27:02,504] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default1]:[2022-10-06 14:27:02,531] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default1]:[2022-10-06 14:27:02,531] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_01-model_states.pt... +[default1]:[2022-10-06 14:27:02,548] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_01-model_states.pt. +[default1]:[2022-10-06 14:27:02,552] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default1]:[2022-10-06 14:27:02,573] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default1]:[2022-10-06 14:27:02,573] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_01-model_states.pt... +[default6]:[2022-10-06 14:27:02,473] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:02,505] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:02,505] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:02,528] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default7]:[2022-10-06 14:27:02,493] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_01-model_states.pt. +[default2]:[2022-10-06 14:27:02,474] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default2]:[2022-10-06 14:27:02,507] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:02,507] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default2]:[2022-10-06 14:27:02,536] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default7]:[2022-10-06 14:27:02,487] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:02,486] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default7]:[2022-10-06 14:27:02,487] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_01-model_states.pt... +[default6]:[2022-10-06 14:27:02,487] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:02,502] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default7]:[2022-10-06 14:27:02,505] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_01-model_states.pt. +[default7]:[2022-10-06 14:27:02,510] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:02,508] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:02,523] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:02,524] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default7]:[2022-10-06 14:27:02,530] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default7]:[2022-10-06 14:27:02,530] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_01-model_states.pt... +[default7]:[2022-10-06 14:27:02,551] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_01-model_states.pt. +[default7]:[2022-10-06 14:27:02,559] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:02,540] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:02,546] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:02,573] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:02,573] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:02,589] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default0]:[2022-10-06 14:27:02,590] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:02,614] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default0]:[2022-10-06 14:27:02,669] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_00_optim_states.pt'] +[default0]:[2022-10-06 14:27:02,670] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt'] +[default3]:[2022-10-06 14:27:02,573] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default3]:[2022-10-06 14:27:02,597] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default3]:[2022-10-06 14:27:02,597] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_01-model_states.pt... +[default3]:[2022-10-06 14:27:02,614] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_01-model_states.pt. +[default3]:[2022-10-06 14:27:02,618] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default3]:[2022-10-06 14:27:02,636] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default3]:[2022-10-06 14:27:02,636] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_01-model_states.pt... +[default3]:[2022-10-06 14:27:02,652] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_01-model_states.pt. +[default3]:[2022-10-06 14:27:02,656] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default3]:[2022-10-06 14:27:02,671] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default3]:[2022-10-06 14:27:02,671] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_01-model_states.pt... +[default1]:[2022-10-06 14:27:02,593] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_01-model_states.pt. +[default5]:[2022-10-06 14:27:02,600] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default5]:[2022-10-06 14:27:02,600] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_01-model_states.pt... +[default5]:[2022-10-06 14:27:02,627] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_01-model_states.pt. +[default1]:[2022-10-06 14:27:02,648] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_01_optim_states.pt'] +[default5]:[2022-10-06 14:27:02,635] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default5]:[2022-10-06 14:27:02,663] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default5]:[2022-10-06 14:27:02,663] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_01-model_states.pt... +[default1]:[2022-10-06 14:27:02,648] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt'] +[default4]:[2022-10-06 14:27:02,588] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:02,596] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default4]:[2022-10-06 14:27:02,623] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:02,623] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default4]:[2022-10-06 14:27:02,648] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:02,654] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:02,642] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:02,612] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_00_optim_states.pt'] +[default2]:[2022-10-06 14:27:02,612] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt'] +[default5]:[2022-10-06 14:27:02,590] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_01-model_states.pt. +[default5]:[2022-10-06 14:27:02,602] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default5]:[2022-10-06 14:27:02,626] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default5]:[2022-10-06 14:27:02,626] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_01-model_states.pt... +[default5]:[2022-10-06 14:27:02,659] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_01-model_states.pt. +[default5]:[2022-10-06 14:27:02,671] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default4]:[2022-10-06 14:27:02,598] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:02,608] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default4]:[2022-10-06 14:27:02,631] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:02,632] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default4]:[2022-10-06 14:27:02,661] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:02,671] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default7]:[2022-10-06 14:27:02,583] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:02,592] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:02,600] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default7]:[2022-10-06 14:27:02,583] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_01-model_states.pt... +[default7]:[2022-10-06 14:27:02,611] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_01-model_states.pt. +[default7]:[2022-10-06 14:27:02,616] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:02,628] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:02,629] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default7]:[2022-10-06 14:27:02,644] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default7]:[2022-10-06 14:27:02,645] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_01-model_states.pt... +[default7]:[2022-10-06 14:27:02,674] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_01-model_states.pt. +[default6]:[2022-10-06 14:27:02,649] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:02,658] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default1]:[2022-10-06 14:27:02,648] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_03_optim_states.pt'] +[default1]:[2022-10-06 14:27:02,649] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_03_optim_states.pt'] +[default7]:[2022-10-06 14:27:02,652] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default7]:[2022-10-06 14:27:02,653] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_01-model_states.pt... +[default0]:[2022-10-06 14:27:02,670] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_02_optim_states.pt'] +[default0]:[2022-10-06 14:27:02,670] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_02_optim_states.pt'] +[default2]:[2022-10-06 14:27:02,612] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_02_optim_states.pt'] +[default2]:[2022-10-06 14:27:02,612] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_02_optim_states.pt'] +[default5]:[2022-10-06 14:27:02,582] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default5]:[2022-10-06 14:27:02,602] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default5]:[2022-10-06 14:27:02,602] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_01-model_states.pt... +[default5]:[2022-10-06 14:27:02,620] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_01-model_states.pt. +[default5]:[2022-10-06 14:27:02,624] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default5]:[2022-10-06 14:27:02,624] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default5]:[2022-10-06 14:27:02,624] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_01-model_states.pt... +[default5]:[2022-10-06 14:27:02,625] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_01-model_states.pt. +[default4]:[2022-10-06 14:27:02,600] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:02,607] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default4]:[2022-10-06 14:27:02,636] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:02,636] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default4]:[2022-10-06 14:27:02,658] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:02,665] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default1]:[2022-10-06 14:27:02,585] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default1]:[2022-10-06 14:27:02,629] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default1]:[2022-10-06 14:27:02,630] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_01-model_states.pt... +[default1]:[2022-10-06 14:27:02,666] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_01-model_states.pt. +[default1]:[2022-10-06 14:27:02,674] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default5]:[2022-10-06 14:27:02,586] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default5]:[2022-10-06 14:27:02,587] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_01-model_states.pt... +[default5]:[2022-10-06 14:27:02,603] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_01-model_states.pt. +[default5]:[2022-10-06 14:27:02,608] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default5]:[2022-10-06 14:27:02,622] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default5]:[2022-10-06 14:27:02,622] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_01-model_states.pt... +[default5]:[2022-10-06 14:27:02,636] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_01-model_states.pt. +[default5]:[2022-10-06 14:27:02,640] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default5]:[2022-10-06 14:27:02,653] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default5]:[2022-10-06 14:27:02,654] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_01-model_states.pt... +[default5]:[2022-10-06 14:27:02,667] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_01-model_states.pt. +[default5]:[2022-10-06 14:27:02,671] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:02,611] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_02_optim_states.pt'] +[default0]:[2022-10-06 14:27:02,611] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_02_optim_states.pt'] +[default7]:[2022-10-06 14:27:02,670] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_03_optim_states.pt'] +[default7]:[2022-10-06 14:27:02,671] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_03_optim_states.pt'] +[default6]:[2022-10-06 14:27:02,599] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default5]:[2022-10-06 14:27:02,591] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:02,606] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default5]:[2022-10-06 14:27:02,613] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:02,642] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:02,642] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:02,658] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default5]:[2022-10-06 14:27:02,614] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_01-model_states.pt... +[default5]:[2022-10-06 14:27:02,649] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_01-model_states.pt. +[default5]:[2022-10-06 14:27:02,656] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:02,666] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default5]:[2022-10-06 14:27:02,678] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default5]:[2022-10-06 14:27:02,679] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_01-model_states.pt... +[default2]:[2022-10-06 14:27:02,592] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:02,592] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default2]:[2022-10-06 14:27:02,612] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default7]:[2022-10-06 14:27:02,593] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default7]:[2022-10-06 14:27:02,593] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_01-model_states.pt... +[default7]:[2022-10-06 14:27:02,613] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_01-model_states.pt. +[default7]:[2022-10-06 14:27:02,617] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default7]:[2022-10-06 14:27:02,617] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default7]:[2022-10-06 14:27:02,618] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_01-model_states.pt... +[default7]:[2022-10-06 14:27:02,618] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_01-model_states.pt. +[default4]:[2022-10-06 14:27:02,616] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoint[default3]:[2022-10-06 14:27:02,603] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_01-model_states.pt. +[default3]:[2022-10-06 14:27:02,612] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default3]:[2022-10-06 14:27:02,651] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +s/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_02_optim_states.pt'] +[default4]:[2022-10-06 14:27:02,616] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmi[default3]:[2022-10-06 14:27:02,651] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_01-model_states.pt... +[default3]:[2022-10-06 14:27:02,684] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_01-model_states.pt. +xnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_02_optim_states.pt'] +[default2]:[2022-10-06 14:27:02,617] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default2]:[2022-10-06 14:27:02,617] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:02,617] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default2]:[2022-10-06 14:27:02,617] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:02,661] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_02_optim_states.pt'] +[default2]:[2022-10-06 14:27:02,661] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_02_optim_states.pt'] +[default0]:[2022-10-06 14:27:02,677] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default7]:[2022-10-06 14:27:02,593] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default7]:[2022-10-06 14:27:02,616] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default7]:[2022-10-06 14:27:02,616] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_01-model_states.pt... +[default7]:[2022-10-06 14:27:02,650] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_01-model_states.pt. +[default7]:[2022-10-06 14:27:02,657] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default7]:[2022-10-06 14:27:02,678] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default7]:[2022-10-06 14:27:02,679] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_01-model_states.pt... +[default7]:[2022-10-06 14:27:02,601] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_01-model_states.pt. +[default7]:[2022-10-06 14:27:02,606] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default7]:[2022-10-06 14:27:02,606] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default7]:[2022-10-06 14:27:02,606] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_01-model_states.pt... +[default7]:[2022-10-06 14:27:02,607] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_01-model_states.pt. +[default4]:[2022-10-06 14:27:02,615] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_00_optim_states.pt'] +[default4]:[2022-10-06 14:27:02,615] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt'] +[default2]:[2022-10-06 14:27:02,661] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_00_optim_states.pt'] +[default2]:[2022-10-06 14:27:02,661] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt'] +[default5]:[2022-10-06 14:27:02,606] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default5]:[2022-10-06 14:27:02,637] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default5]:[2022-10-06 14:27:02,637] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_01-model_states.pt... +[default5]:[2022-10-06 14:27:02,655] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_01-model_states.pt. +[default5]:[2022-10-06 14:27:02,704] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_01_optim_states.pt'] +[default5]:[2022-10-06 14:27:02,704] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt'] +[default0]:[2022-10-06 14:27:02,611] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_00_optim_states.pt'] +[default0]:[2022-10-06 14:27:02,611] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt'] +[default0]:could not find arguments in the checkpoint ... +[default0]: checkpoint version 3.0 +[default7]:[2022-10-06 14:27:02,621] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_01-model_states.pt. +[default7]:[2022-10-06 14:27:02,670] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_01_optim_states.pt'] +[default7]:[2022-10-06 14:27:02,670] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt'] +[default3]:[2022-10-06 14:27:02,693] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_01-model_states.pt. +[default3]:[2022-10-06 14:27:02,699] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default3]:[2022-10-06 14:27:02,720] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default3]:[2022-10-06 14:27:02,720] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_01-model_states.pt... +[default3]:[2022-10-06 14:27:02,742] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_01-model_states.pt. +[default5]:[2022-10-06 14:27:02,694] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_01-model_states.pt. +[default5]:[2022-10-06 14:27:02,704] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default5]:[2022-10-06 14:27:02,733] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default5]:[2022-10-06 14:27:02,733] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_01-model_states.pt... +[default5]:[2022-10-06 14:27:02,756] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_01-model_states.pt. +[default5]:[2022-10-06 14:27:02,766] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default4]:[2022-10-06 14:27:02,688] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:02,688] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default4]:[2022-10-06 14:27:02,719] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:02,726] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default4]:[2022-10-06 14:27:02,752] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:02,752] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:02,701] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:02,726] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:02,726] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:02,755] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:02,765] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default5]:[2022-10-06 14:27:02,690] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default5]:[2022-10-06 14:27:02,690] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_01-model_states.pt... +[default5]:[2022-10-06 14:27:02,708] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_01-model_states.pt. +[default5]:[2022-10-06 14:27:02,713] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default5]:[2022-10-06 14:27:02,733] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default5]:[2022-10-06 14:27:02,733] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_01-model_states.pt... +[default5]:[2022-10-06 14:27:02,758] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_01-model_states.pt. +[default5]:[2022-10-06 14:27:02,771] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default4]:[2022-10-06 14:27:02,692] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:02,692] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default4]:[2022-10-06 14:27:02,713] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:02,718] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default4]:[2022-10-06 14:27:02,739] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:02,739] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default4]:[2022-10-06 14:27:02,767] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:02,685] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:02,686] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:02,703] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:02,708] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:02,735] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default7]:[2022-10-06 14:27:02,679] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default7]:[2022-10-06 14:27:02,699] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:02,735] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:02,754] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:02,763] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default7]:[2022-10-06 14:27:02,700] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_01-model_states.pt... +[default7]:[2022-10-06 14:27:02,721] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_01-model_states.pt. +[default7]:[2022-10-06 14:27:02,727] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default7]:[2022-10-06 14:27:02,748] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default7]:[2022-10-06 14:27:02,748] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_01-model_states.pt... +[default4]:[2022-10-06 14:27:02,690] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:02,690] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default4]:[2022-10-06 14:27:02,713] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:02,728] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default4]:[2022-10-06 14:27:02,753] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:02,753] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default4]:[2022-10-06 14:27:02,774] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:02,780] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default1]:[2022-10-06 14:27:02,706] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default1]:[2022-10-06 14:27:02,706] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_01-model_states.pt... +[default1]:[2022-10-06 14:27:02,728] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_01-model_states.pt. +[default1]:[2022-10-06 14:27:02,735] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default1]:[2022-10-06 14:27:02,774] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default1]:[2022-10-06 14:27:02,775] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_01-model_states.pt... +[default5]:[2022-10-06 14:27:02,705] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_03_optim_states.pt'] +[default5]:[2022-10-06 14:27:02,705] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_03_optim_states.pt'] +[default5]:[2022-10-06 14:27:02,685] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default5]:[2022-10-06 14:27:02,686] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_01-model_states.pt... +[default5]:[2022-10-06 14:27:02,701] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_01-model_states.pt. +[default5]:[2022-10-06 14:27:02,706] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default5]:[2022-10-06 14:27:02,722] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default5]:[2022-10-06 14:27:02,722] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_01-model_states.pt... +[default5]:[2022-10-06 14:27:02,735] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_01-model_states.pt. +[default5]:[2022-10-06 14:27:02,740] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default5]:[2022-10-06 14:27:02,754] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default5]:[2022-10-06 14:27:02,754] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_01-model_states.pt... +[default5]:[2022-10-06 14:27:02,768] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_01-model_states.pt. +[default5]:[2022-10-06 14:27:02,772] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default2]:[2022-10-06 14:27:02,778] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:02,780] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:02,699] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default5]:[2022-10-06 14:27:02,713] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_01-model_states.pt. +[default6]:[2022-10-06 14:27:02,699] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:02,716] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:02,728] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:02,765] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:02,765] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:02,780] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:02,788] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default5]:[2022-10-06 14:27:02,731] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default5]:[2022-10-06 14:27:02,758] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default5]:[2022-10-06 14:27:02,758] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_01-model_states.pt... +[default3]:[2022-10-06 14:27:02,691] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default3]:[2022-10-06 14:27:02,724] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default3]:[2022-10-06 14:27:02,724] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_01-model_states.pt... +[default3]:[2022-10-06 14:27:02,762] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_01-model_states.pt. +[default3]:[2022-10-06 14:27:02,768] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:02,720] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:02,752] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default0]:[2022-10-06 14:27:02,752] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:02,780] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default7]:[2022-10-06 14:27:02,713] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_01-model_states.pt. +[default7]:[2022-10-06 14:27:02,731] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default7]:[2022-10-06 14:27:02,755] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default7]:[2022-10-06 14:27:02,756] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_01-model_states.pt... +[default0]:[2022-10-06 14:27:02,786] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default3]:[2022-10-06 14:27:02,794] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_01_optim_states.pt'] +[default3]:[2022-10-06 14:27:02,794] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt'] +[default5]:[2022-10-06 14:27:02,802] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default5]:[2022-10-06 14:27:02,802] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_01-model_states.pt... +[default5]:[2022-10-06 14:27:02,826] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_01-model_states.pt. +[default5]:[2022-10-06 14:27:02,834] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default5]:[2022-10-06 14:27:02,868] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default5]:[2022-10-06 14:27:02,868] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_01-model_states.pt... +[default4]:[2022-10-06 14:27:02,779] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:02,784] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default4]:[2022-10-06 14:27:02,807] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:02,807] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default4]:[2022-10-06 14:27:02,838] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:02,846] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default4]:[2022-10-06 14:27:02,872] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:02,872] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:02,794] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:02,794] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:02,817] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:02,826] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:02,850] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:02,850] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:02,867] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default5]:[2022-10-06 14:27:02,798] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default5]:[2022-10-06 14:27:02,798] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_01-model_states.pt... +[default5]:[2022-10-06 14:27:02,815] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_01-model_states.pt. +[default5]:[2022-10-06 14:27:02,821] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default5]:[2022-10-06 14:27:02,838] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default5]:[2022-10-06 14:27:02,838] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_01-model_states.pt... +[default5]:[2022-10-06 14:27:02,871] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_01-model_states.pt. +[default4]:[2022-10-06 14:27:02,778] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default4]:[2022-10-06 14:27:02,804] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:02,805] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default4]:[2022-10-06 14:27:02,824] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:02,828] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default4]:[2022-10-06 14:27:02,846] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:02,846] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default4]:[2022-10-06 14:27:02,865] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:02,870] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:02,786] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:02,787] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:02,805] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:02,810] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default7]:[2022-10-06 14:27:02,780] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_01-model_states.pt. +[default7]:[2022-10-06 14:27:02,791] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:02,853] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:02,853] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:02,869] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default7]:[2022-10-06 14:27:02,811] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default7]:[2022-10-06 14:27:02,811] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_01-model_states.pt... +[default6]:[2022-10-06 14:27:02,875] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default7]:[2022-10-06 14:27:02,830] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_01-model_states.pt. +[default7]:[2022-10-06 14:27:02,835] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default7]:[2022-10-06 14:27:02,853] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default7]:[2022-10-06 14:27:02,853] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_01-model_states.pt... +[default7]:[2022-10-06 14:27:02,874] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_01-model_states.pt. +[default7]:[2022-10-06 14:27:02,788] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_01-model_states.pt. +[default7]:[2022-10-06 14:27:02,835] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default7]:[2022-10-06 14:27:02,857] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default7]:[2022-10-06 14:27:02,858] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_01-model_states.pt... +[default3]:[2022-10-06 14:27:02,795] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_03_optim_states.pt'] +[default3]:[2022-10-06 14:27:02,795] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_03_optim_states.pt'] +[default4]:[2022-10-06 14:27:02,804] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:02,804] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default4]:[2022-10-06 14:27:02,825] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:02,831] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default4]:[2022-10-06 14:27:02,848] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:02,848] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default4]:[2022-10-06 14:27:02,873] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:02,880] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default1]:[2022-10-06 14:27:02,818] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_01-model_states.pt. +[default1]:[2022-10-06 14:27:02,825] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default1]:[2022-10-06 14:27:02,853] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default1]:[2022-10-06 14:27:02,853] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_01-model_states.pt... +[default5]:[2022-10-06 14:27:02,786] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default5]:[2022-10-06 14:27:02,786] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_01-model_states.pt... +[default5]:[2022-10-06 14:27:02,800] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_01-model_states.pt. +[default5]:[2022-10-06 14:27:02,804] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default5]:[2022-10-06 14:27:02,822] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default5]:[2022-10-06 14:27:02,822] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_01-model_states.pt... +[default5]:[2022-10-06 14:27:02,837] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_01-model_states.pt. +[default5]:[2022-10-06 14:27:02,842] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default5]:[2022-10-06 14:27:02,842] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default5]:[2022-10-06 14:27:02,842] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_01-model_states.pt... +[default5]:[2022-10-06 14:27:02,842] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_01-model_states.pt. +[default6]:[2022-10-06 14:27:02,824] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default5]:[2022-10-06 14:27:02,799] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_01-model_states.pt. +[default5]:[2022-10-06 14:27:02,807] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:02,824] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:02,840] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:02,846] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default5]:[2022-10-06 14:27:02,830] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default5]:[2022-10-06 14:27:02,831] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_01-model_states.pt... +[default5]:[2022-10-06 14:27:02,886] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_01-model_states.pt. +[default3]:[2022-10-06 14:27:02,796] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default3]:[2022-10-06 14:27:02,797] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_01-model_states.pt... +[default3]:[2022-10-06 14:27:02,832] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_01-model_states.pt. +[default3]:[2022-10-06 14:27:02,838] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default3]:[2022-10-06 14:27:02,863] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default3]:[2022-10-06 14:27:02,863] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_01-model_states.pt... +[default7]:[2022-10-06 14:27:02,799] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_01-model_states.pt. +[default0]:[2022-10-06 14:27:02,820] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default7]:[2022-10-06 14:27:02,808] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default7]:[2022-10-06 14:27:02,830] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default7]:[2022-10-06 14:27:02,831] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_01-model_states.pt... +[default7]:[2022-10-06 14:27:02,885] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_01-model_states.pt. +[default0]:[2022-10-06 14:27:02,821] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:02,846] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default0]:[2022-10-06 14:27:02,851] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:02,878] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default0]:[2022-10-06 14:27:02,879] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default5]:[2022-10-06 14:27:02,891] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_01-model_states.pt. +[default5]:[2022-10-06 14:27:02,898] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default5]:[2022-10-06 14:27:02,921] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default5]:[2022-10-06 14:27:02,921] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_01-model_states.pt... +[default5]:[2022-10-06 14:27:02,945] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_01-model_states.pt. +[default5]:[2022-10-06 14:27:02,954] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default4]:[2022-10-06 14:27:02,897] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:02,903] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default4]:[2022-10-06 14:27:02,932] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:02,932] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default4]:[2022-10-06 14:27:02,961] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:02,970] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:02,875] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:02,897] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:02,898] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:02,919] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:02,927] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:02,951] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:02,951] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:02,973] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default5]:[2022-10-06 14:27:02,878] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default5]:[2022-10-06 14:27:02,901] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default5]:[2022-10-06 14:27:02,902] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_01-model_states.pt... +[default5]:[2022-10-06 14:27:02,922] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_01-model_states.pt. +[default5]:[2022-10-06 14:27:02,928] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default5]:[2022-10-06 14:27:02,953] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default5]:[2022-10-06 14:27:02,953] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_01-model_states.pt... +[default5]:[2022-10-06 14:27:02,973] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_01-model_states.pt. +[default4]:[2022-10-06 14:27:02,893] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:02,894] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default4]:[2022-10-06 14:27:02,913] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:02,917] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default4]:[2022-10-06 14:27:02,937] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:02,937] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default4]:[2022-10-06 14:27:02,962] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:02,967] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:02,935] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default7]:[2022-10-06 14:27:02,881] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:02,935] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:02,958] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default7]:[2022-10-06 14:27:02,905] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default7]:[2022-10-06 14:27:02,905] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_01-model_states.pt... +[default6]:[2022-10-06 14:27:02,964] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default7]:[2022-10-06 14:27:02,925] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_01-model_states.pt. +[default7]:[2022-10-06 14:27:02,930] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default7]:[2022-10-06 14:27:02,958] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:02,964] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:02,964] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:02,964] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default7]:[2022-10-06 14:27:02,958] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_01-model_states.pt... +[default7]:[2022-10-06 14:27:02,879] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_01-model_states.pt. +[default7]:[2022-10-06 14:27:02,886] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default7]:[2022-10-06 14:27:02,910] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default7]:[2022-10-06 14:27:02,911] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_01-model_states.pt... +[default7]:[2022-10-06 14:27:02,934] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_01-model_states.pt. +[default7]:[2022-10-06 14:27:02,940] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default7]:[2022-10-06 14:27:02,964] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default7]:[2022-10-06 14:27:02,964] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_01-model_states.pt... +[default4]:[2022-10-06 14:27:02,904] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:02,904] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default4]:[2022-10-06 14:27:02,928] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:02,934] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default1]:[2022-10-06 14:27:02,887] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_01-model_states.pt. +[default1]:[2022-10-06 14:27:02,895] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default1]:[2022-10-06 14:27:02,936] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default1]:[2022-10-06 14:27:02,936] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_01-model_states.pt... +[default1]:[2022-10-06 14:27:02,957] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_01-model_states.pt. +[default1]:[2022-10-06 14:27:02,962] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default1]:[2022-10-06 14:27:02,981] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default1]:[2022-10-06 14:27:02,981] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_01-model_states.pt... +[default2]:[2022-10-06 14:27:02,975] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:02,892] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default5]:[2022-10-06 14:27:02,897] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:02,892] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default5]:[2022-10-06 14:27:02,920] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:02,908] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:02,915] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default5]:[2022-10-06 14:27:02,920] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_01-model_states.pt... +[default5]:[2022-10-06 14:27:02,973] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_01-model_states.pt. +[default5]:[2022-10-06 14:27:02,985] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default3]:[2022-10-06 14:27:02,899] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_01-model_states.pt. +[default3]:[2022-10-06 14:27:02,909] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default3]:[2022-10-06 14:27:02,936] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default3]:[2022-10-06 14:27:02,936] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_01-model_states.pt... +[default3]:[2022-10-06 14:27:02,974] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_01-model_states.pt. +[default3]:[2022-10-06 14:27:02,983] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default7]:[2022-10-06 14:27:02,897] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default7]:[2022-10-06 14:27:02,918] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default0]:[2022-10-06 14:27:02,903] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default0]:[2022-10-06 14:27:02,910] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:02,942] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default0]:[2022-10-06 14:27:02,942] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:02,961] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default0]:[2022-10-06 14:27:02,966] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default7]:[2022-10-06 14:27:02,919] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_01-model_states.pt... +[default7]:[2022-10-06 14:27:02,972] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_01-model_states.pt. +[default7]:[2022-10-06 14:27:02,984] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:02,985] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default0]:[2022-10-06 14:27:02,985] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default5]:[2022-10-06 14:27:02,992] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default5]:[2022-10-06 14:27:02,993] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_01-model_states.pt... +[default5]:[2022-10-06 14:27:03,014] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_01-model_states.pt. +[default5]:[2022-10-06 14:27:03,022] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default5]:[2022-10-06 14:27:03,046] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default5]:[2022-10-06 14:27:03,046] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_01-model_states.pt... +[default5]:[2022-10-06 14:27:03,070] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_01-model_states.pt. +[default4]:[2022-10-06 14:27:02,994] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:02,994] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default4]:[2022-10-06 14:27:03,021] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:03,026] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/globa[default4]:[2022-10-06 14:27:03,027] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default4]:[2022-10-06 14:27:03,058] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:03,058] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +l_step0/zero_pp_rank_5_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_00_optim_states.pt'] +[default6]:[2022-10-06 14:27:03,026] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt'] +[default6]:[2022-10-06 14:27:02,980] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:03,001] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:03,002] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:03,023] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:03,031] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:03,055] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:03,055] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default5]:[2022-10-06 14:27:02,980] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default5]:[2022-10-06 14:27:03,003] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default5]:[2022-10-06 14:27:03,003] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_01-model_states.pt... +[default5]:[2022-10-06 14:27:03,032] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_01-model_states.pt. +[default5]:[2022-10-06 14:27:03,037] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default5]:[2022-10-06 14:27:03,059] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default5]:[2022-10-06 14:27:03,059] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_01-model_states.pt... +[default4]:[2022-10-06 14:27:02,990] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:02,990] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default4]:[2022-10-06 14:27:03,013] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:03,019] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default4]:[2022-10-06 14:27:03,043] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:03,043] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default4]:[2022-10-06 14:27:03,065] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:03,071] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default4]:[2022-10-06 14:27:03,071] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:03,071] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default4]:[2022-10-06 14:27:03,072] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default7]:[2022-10-06 14:27:02,983] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_01-model_states.pt. +[default7]:[2022-10-06 14:27:02,991] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default7]:[2022-10-06 14:27:03,017] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default7]:[2022-10-06 14:27:03,017] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_01-model_states.pt... +[default6]:[2022-10-06 14:27:03,027] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_02_optim_states.pt'] +[default6]:[2022-10-06 14:27:03,027] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_02_optim_states.pt'] +[default7]:[2022-10-06 14:27:03,040] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_01-model_states.pt. +[default7]:[2022-10-06 14:27:03,046] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default7]:[2022-10-06 14:27:03,069] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default7]:[2022-10-06 14:27:03,069] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_01-model_states.pt... +[default7]:[2022-10-06 14:27:02,985] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_01-model_states.pt. +[default7]:[2022-10-06 14:27:02,991] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default7]:[2022-10-06 14:27:03,012] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default7]:[2022-10-06 14:27:03,013] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_01-model_states.pt... +[default7]:[2022-10-06 14:27:03,044] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_01-model_states.pt. +[default7]:[2022-10-06 14:27:03,050] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default4]:[2022-10-06 14:27:02,994] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:02,994] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default4]:[2022-10-06 14:27:03,015] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:03,029] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default4]:[2022-10-06 14:27:03,068] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:03,068] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default1]:[2022-10-06 14:27:03,014] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_01-model_states.pt. +[default1]:[2022-10-06 14:27:03,024] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default1]:[2022-10-06 14:27:03,050] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default1]:[2022-10-06 14:27:03,051] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_01-model_states.pt... +[default2]:[2022-10-06 14:27:03,035] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default2]:[2022-10-06 14:27:03,086] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:03,086] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:02,991] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:02,991] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:03,008] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:03,016] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:03,066] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:03,067] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:03,084] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default5]:[2022-10-06 14:27:03,008] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default5]:[2022-10-06 14:27:03,008] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_01-model_states.pt... +[default5]:[2022-10-06 14:27:03,034] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_01-model_states.pt. +[default5]:[2022-10-06 14:27:03,041] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default5]:[2022-10-06 14:27:03,068] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default5]:[2022-10-06 14:27:03,069] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_01-model_states.pt... +[default3]:[2022-10-06 14:27:03,019] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default3]:[2022-10-06 14:27:03,019] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_01-model_states.pt... +[default3]:[2022-10-06 14:27:03,054] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_01-model_states.pt. +[default3]:[2022-10-06 14:27:03,062] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default7]:[2022-10-06 14:27:03,007] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default0]:[2022-10-06 14:27:03,017] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default0]:[2022-10-06 14:27:03,029] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:03,056] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default7]:[2022-10-06 14:27:03,008] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_01-model_states.pt... +[default7]:[2022-10-06 14:27:03,034] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_01-model_states.pt. +[default0]:[2022-10-06 14:27:03,057] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:03,087] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default7]:[2022-10-06 14:27:03,041] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default7]:[2022-10-06 14:27:03,068] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default7]:[2022-10-06 14:27:03,069] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_01-model_states.pt... +[default5]:[2022-10-06 14:27:03,138] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_01_optim_states.pt'] +[default5]:[2022-10-06 14:27:03,138] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt'] +[default5]:[2022-10-06 14:27:03,078] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default5]:[2022-10-06 14:27:03,105] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default5]:[2022-10-06 14:27:03,105] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_01-model_states.pt... +[default7]:[2022-10-06 14:27:03,168] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_01_optim_states.pt'] +[default7]:[2022-10-06 14:27:03,168] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt'] +[default5]:[2022-10-06 14:27:03,086] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_01-model_states.pt. +[default5]:[2022-10-06 14:27:03,093] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default5]:[2022-10-06 14:27:03,093] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default5]:[2022-10-06 14:27:03,093] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_01-model_states.pt... +[default5]:[2022-10-06 14:27:03,093] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_01-model_states.pt. +[default5]:[2022-10-06 14:27:03,138] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_03_optim_states.pt'] +[default5]:[2022-10-06 14:27:03,138] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_03_optim_states.pt'] +[default4]:[2022-10-06 14:27:03,158] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_02_optim_states.pt'] +[default4]:[2022-10-06 14:27:03,159] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_02_optim_states.pt'] +[default7]:[2022-10-06 14:27:03,088] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_01-model_states.pt. +[default7]:[2022-10-06 14:27:03,092] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default7]:[2022-10-06 14:27:03,093] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default7]:[2022-10-06 14:27:03,093] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_01-model_states.pt... +[default7]:[2022-10-06 14:27:03,093] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_01-model_states.pt. +[default7]:[2022-10-06 14:27:03,168] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_03_optim_states.pt'] +[default7]:[2022-10-06 14:27:03,168] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_03_optim_states.pt'] +[default4]:[2022-10-06 14:27:03,157] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_00_optim_states.pt'] +[default4]:[2022-10-06 14:27:03,157] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt'] +[default4]:[2022-10-06 14:27:03,094] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:03,100] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default4]:[2022-10-06 14:27:03,127] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:03,127] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default4]:[2022-10-06 14:27:03,148] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:03,156] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default1]:[2022-10-06 14:27:03,091] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_01-model_states.pt. +[default1]:[2022-10-06 14:27:03,096] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default1]:[2022-10-06 14:27:03,138] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default1]:[2022-10-06 14:27:03,139] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_01-model_states.pt... +[default1]:[2022-10-06 14:27:03,174] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_01-model_states.pt. +[default1]:[2022-10-06 14:27:03,181] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default2]:[2022-10-06 14:27:03,130] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:03,138] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default2]:[2022-10-06 14:27:03,179] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:03,179] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:03,090] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default3]:[2022-10-06 14:27:03,100] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default3]:[2022-10-06 14:27:03,100] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_01-model_states.pt... +[default3]:[2022-10-06 14:27:03,136] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_01-model_states.pt. +[default3]:[2022-10-06 14:27:03,146] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default3]:[2022-10-06 14:27:03,181] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default3]:[2022-10-06 14:27:03,181] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_01-model_states.pt... +[default5]:[2022-10-06 14:27:03,104] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_01-model_states.pt. +[default5]:[2022-10-06 14:27:03,111] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default5]:[2022-10-06 14:27:03,133] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:03,124] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default5]:[2022-10-06 14:27:03,134] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_01-model_states.pt... +[default5]:[2022-10-06 14:27:03,168] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_01-model_states.pt. +[default5]:[2022-10-06 14:27:03,174] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:03,092] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:03,122] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default0]:[2022-10-06 14:27:03,122] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:03,151] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default0]:[2022-10-06 14:27:03,157] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:03,124] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:03,140] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:03,149] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:03,187] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:03,184] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:03,184] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:03,187] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default7]:[2022-10-06 14:27:03,104] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_01-model_states.pt. +[default7]:[2022-10-06 14:27:03,112] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default7]:[2022-10-06 14:27:03,133] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default7]:[2022-10-06 14:27:03,134] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_01-model_states.pt... +[default7]:[2022-10-06 14:27:03,171] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_01-model_states.pt. +[default7]:[2022-10-06 14:27:03,178] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default5]:[2022-10-06 14:27:03,133] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_01-model_states.pt. +[default5]:[2022-10-06 14:27:03,142] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default4]:[2022-10-06 14:27:03,091] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:03,098] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default4]:[2022-10-06 14:27:03,124] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:03,124] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default4]:[2022-10-06 14:27:03,153] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:03,160] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:03,080] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:03,089] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:03,111] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:03,111] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:03,132] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:03,141] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:03,170] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:03,170] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default7]:[2022-10-06 14:27:03,077] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default7]:[2022-10-06 14:27:03,077] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_01-model_states.pt... +[default7]:[2022-10-06 14:27:03,100] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_01-model_states.pt. +[default7]:[2022-10-06 14:27:03,106] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default7]:[2022-10-06 14:27:03,129] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default7]:[2022-10-06 14:27:03,129] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_01-model_states.pt... +[default7]:[2022-10-06 14:27:03,155] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_01-model_states.pt. +[default7]:[2022-10-06 14:27:03,161] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default5]:[2022-10-06 14:27:03,177] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default5]:[2022-10-06 14:27:03,177] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_01-model_states.pt... +[default5]:[2022-10-06 14:27:03,202] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_01-model_states.pt. +[default5]:[2022-10-06 14:27:03,253] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_01_optim_states.pt'] +[default5]:[2022-10-06 14:27:03,254] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt'] +[default4]:[2022-10-06 14:27:03,191] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:03,191] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default4]:[2022-10-06 14:27:03,219] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:03,226] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default4]:[2022-10-06 14:27:03,250] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:03,251] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:03,192] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:03,200] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:03,221] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:03,221] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:03,240] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:03,247] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:03,267] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:03,267] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default7]:[2022-10-06 14:27:03,185] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default7]:[2022-10-06 14:27:03,185] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_01-model_states.pt... +[default7]:[2022-10-06 14:27:03,211] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_01-model_states.pt. +[default7]:[2022-10-06 14:27:03,217] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default7]:[2022-10-06 14:27:03,235] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default7]:[2022-10-06 14:27:03,235] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_01-model_states.pt... +[default7]:[2022-10-06 14:27:03,256] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_01-model_states.pt. +[default7]:[2022-10-06 14:27:03,262] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default5]:[2022-10-06 14:27:03,254] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_03_optim_states.pt'] +[default5]:[2022-10-06 14:27:03,254] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_03_optim_states.pt'] +[default4]:[2022-10-06 14:27:03,189] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:03,190] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default4]:[2022-10-06 14:27:03,211] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:03,276] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_00_optim_states.pt'] +[default4]:[2022-10-06 14:27:03,277] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt'] +[default1]:[2022-10-06 14:27:03,225] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default1]:[2022-10-06 14:27:03,225] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_01-model_states.pt... +[default1]:[2022-10-06 14:27:03,254] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_01-model_states.pt. +[default1]:[2022-10-06 14:27:03,263] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default2]:[2022-10-06 14:27:03,223] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:03,231] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default2]:[2022-10-06 14:27:03,264] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:03,264] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default3]:[2022-10-06 14:27:03,219] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_01-model_states.pt. +[default3]:[2022-10-06 14:27:03,227] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default3]:[2022-10-06 14:27:03,255] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default3]:[2022-10-06 14:27:03,255] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_01-model_states.pt... +[default3]:[2022-10-06 14:27:03,280] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_01-model_states.pt. +[default3]:[2022-10-06 14:27:03,285] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default5]:[2022-10-06 14:27:03,199] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default5]:[2022-10-06 14:27:03,200] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_01-model_states.pt... +[default5]:[2022-10-06 14:27:03,235] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_01-model_states.pt. +[default5]:[2022-10-06 14:27:03,246] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default5]:[2022-10-06 14:27:03,265] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default5]:[2022-10-06 14:27:03,266] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_01-model_states.pt... +[default6]:[2022-10-06 14:27:03,200] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:03,253] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_00_optim_states.pt'] +[default6]:[2022-10-06 14:27:03,253] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt'] +[default0]:[2022-10-06 14:27:03,216] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default0]:[2022-10-06 14:27:03,227] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:03,260] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default0]:[2022-10-06 14:27:03,260] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:03,285] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default7]:[2022-10-06 14:27:03,203] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default7]:[2022-10-06 14:27:03,203] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_01-model_states.pt... +[default7]:[2022-10-06 14:27:03,234] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_01-model_states.pt. +[default7]:[2022-10-06 14:27:03,245] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default7]:[2022-10-06 14:27:03,265] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default7]:[2022-10-06 14:27:03,266] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_01-model_states.pt... +[default4]:[2022-10-06 14:27:03,277] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_02_optim_states.pt'] +[default4]:[2022-10-06 14:27:03,277] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_02_optim_states.pt'] +[default6]:[2022-10-06 14:27:03,254] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_02_optim_states.pt'] +[default6]:[2022-10-06 14:27:03,254] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_02_optim_states.pt'] +[default4]:[2022-10-06 14:27:03,275] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default4]:[2022-10-06 14:27:03,324] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_00_optim_states.pt'] +[default4]:[2022-10-06 14:27:03,324] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt'] +[default6]:[2022-10-06 14:27:03,286] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:03,291] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:03,311] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default6]:[2022-10-06 14:27:03,311] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default6]:[2022-10-06 14:27:03,330] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default7]:[2022-10-06 14:27:03,287] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default7]:[2022-10-06 14:27:03,287] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_01-model_states.pt... +[default7]:[2022-10-06 14:27:03,303] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_01-model_states.pt. +[default7]:[2022-10-06 14:27:03,308] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default7]:[2022-10-06 14:27:03,325] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default7]:[2022-10-06 14:27:03,326] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_01-model_states.pt... +[default7]:[2022-10-06 14:27:03,341] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_01-model_states.pt. +[default7]:[2022-10-06 14:27:03,346] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default7]:[2022-10-06 14:27:03,360] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default7]:[2022-10-06 14:27:03,360] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_01-model_states.pt... +[default7]:[2022-10-06 14:27:03,374] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_01-model_states.pt. +[default4]:[2022-10-06 14:27:03,324] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_02_optim_states.pt'] +[default4]:[2022-10-06 14:27:03,325] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_02_optim_states.pt'] +[default6]:[2022-10-06 14:27:03,379] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_02_optim_states.pt'] +[default6]:[2022-10-06 14:27:03,379] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_02_optim_states.pt'] +[default1]:[2022-10-06 14:27:03,290] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default1]:[2022-10-06 14:27:03,291] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_01-model_states.pt... +[default1]:[2022-10-06 14:27:03,310] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_01-model_states.pt. +[default1]:[2022-10-06 14:27:03,316] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default1]:[2022-10-06 14:27:03,339] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default1]:[2022-10-06 14:27:03,339] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_01-model_states.pt... +[default1]:[2022-10-06 14:27:03,358] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_01-model_states.pt. +[default1]:[2022-10-06 14:27:03,363] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default1]:[2022-10-06 14:27:03,383] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default1]:[2022-10-06 14:27:03,383] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_01-model_states.pt... +[default5]:[2022-10-06 14:27:03,347] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_03_optim_states.pt'] +[default5]:[2022-10-06 14:27:03,348] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_03_optim_states.pt'] +[default2]:[2022-10-06 14:27:03,291] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:03,298] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default2]:[2022-10-06 14:27:03,322] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:03,323] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default2]:[2022-10-06 14:27:03,346] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:03,352] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default2]:[2022-10-06 14:27:03,369] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:03,369] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default3]:[2022-10-06 14:27:03,309] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default3]:[2022-10-06 14:27:03,309] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_01-model_states.pt... +[default3]:[2022-10-06 14:27:03,330] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_01-model_states.pt. +[default5]:[2022-10-06 14:27:03,294] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_01-model_states.pt. +[default5]:[2022-10-06 14:27:03,347] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_01_optim_states.pt'] +[default5]:[2022-10-06 14:27:03,347] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt'] +[default0]:[2022-10-06 14:27:03,290] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:03,310] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default0]:[2022-10-06 14:27:03,310] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:03,332] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default0]:[2022-10-06 14:27:03,338] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:03,358] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default0]:[2022-10-06 14:27:03,358] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:03,377] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default0]:[2022-10-06 14:27:03,382] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default7]:[2022-10-06 14:27:03,293] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_01-model_states.pt. +[default6]:[2022-10-06 14:27:03,378] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_00_optim_states.pt'] +[default6]:[2022-10-06 14:27:03,378] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt'] +[default7]:[2022-10-06 14:27:03,379] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default7]:[2022-10-06 14:27:03,397] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default7]:[2022-10-06 14:27:03,397] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_01-model_states.pt... +[default7]:[2022-10-06 14:27:03,412] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_01-model_states.pt. +[default7]:[2022-10-06 14:27:03,459] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_01_optim_states.pt'] +[default7]:[2022-10-06 14:27:03,459] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt'] +[default1]:[2022-10-06 14:27:03,403] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_01-model_states.pt. +[default1]:[2022-10-06 14:27:03,456] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_01_optim_states.pt'] +[default1]:[2022-10-06 14:27:03,457] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt'] +[default2]:[2022-10-06 14:27:03,389] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:03,394] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default2]:[2022-10-06 14:27:03,426] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:03,426] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default2]:[2022-10-06 14:27:03,441] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:03,447] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default2]:[2022-10-06 14:27:03,462] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:03,462] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default2]:[2022-10-06 14:27:03,476] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:03,480] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default7]:[2022-10-06 14:27:03,460] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_03_optim_states.pt'] +[default7]:[2022-10-06 14:27:03,460] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_03_optim_states.pt'] +[default3]:[2022-10-06 14:27:03,409] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_01_optim_states.pt'] +[default3]:[2022-10-06 14:27:03,409] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt'] +[default0]:[2022-10-06 14:27:03,405] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default0]:[2022-10-06 14:27:03,405] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default0]:[2022-10-06 14:27:03,426] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default0]:[2022-10-06 14:27:03,478] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_00_optim_states.pt'] +[default0]:[2022-10-06 14:27:03,478] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt'] +[default7]:[2022-10-06 14:27:03,398] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_01_optim_states.pt'] +[default7]:[2022-10-06 14:27:03,398] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt'] +[default1]:[2022-10-06 14:27:03,457] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_03_optim_states.pt'] +[default1]:[2022-10-06 14:27:03,457] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_03_optim_states.pt'] +[default7]:[2022-10-06 14:27:03,399] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_03_optim_states.pt'] +[default7]:[2022-10-06 14:27:03,400] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_03_optim_states.pt'] +[default3]:[2022-10-06 14:27:03,409] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_03_optim_states.pt'] +[default3]:[2022-10-06 14:27:03,410] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_03_optim_states.pt'] +[default0]:[2022-10-06 14:27:03,479] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_02_optim_states.pt'] +[default0]:[2022-10-06 14:27:03,479] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_02_optim_states.pt'] +[default2]:[2022-10-06 14:27:03,495] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:03,495] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default2]:[2022-10-06 14:27:03,509] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:03,514] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default2]:[2022-10-06 14:27:03,530] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:03,530] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default2]:[2022-10-06 14:27:03,545] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:03,549] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default2]:[2022-10-06 14:27:03,563] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:03,564] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default2]:[2022-10-06 14:27:03,577] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:03,582] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default2]:[2022-10-06 14:27:03,596] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:03,597] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default2]:[2022-10-06 14:27:03,615] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:03,619] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default2]:[2022-10-06 14:27:03,636] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default2]:[2022-10-06 14:27:03,636] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default2]:[2022-10-06 14:27:03,653] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default0]: successfully loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq at iteration 0 +[default2]:[2022-10-06 14:27:03,700] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_00_optim_states.pt'] +[default2]:[2022-10-06 14:27:03,700] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt'] +[default2]:[2022-10-06 14:27:03,700] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_02_optim_states.pt'] +[default2]:[2022-10-06 14:27:03,701] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_02_optim_states.pt'] +[default7]:time (ms) | load-checkpoint: 7396.26 +[default0]:estimated model parameters: 2.236514304 +[default0]:estimated model parameters without embeddings: 1.208909824 +[default0]:[after model, optimizer, and learning rate scheduler are built] datetime: 2022-10-06 14:27:03 +[default0]:> building train, validation, and test datasets ... +[default0]: > datasets target sizes (minimum size): +[default0]: train: 6348800 +[default0]: validation: 512000 +[default0]: test: 20480 +[default0]:> building train, validation, and test datasets for T0 ... +[default0]: > building dataset index ... +[default0]:/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/utils.py:365: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings +[default0]: warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.124352 seconds +[default0]: number of documents: 31495184 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 29920425) total of 29920425 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.051868 seconds +[default0]: number of documents: 31495184 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003322 seconds +[default0]: number of documents: 31495184 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_en_train_indexmap_2470317ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_en_train_indexmap_2470317ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.096 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.074083 seconds +[default0]: number of documents: 5151349 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 4893782) total of 4893782 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.036241 seconds +[default0]: number of documents: 5151349 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003116 seconds +[default0]: number of documents: 5151349 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_es_train_indexmap_501992ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_es_train_indexmap_501992ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.055 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.014762 seconds +[default0]: number of documents: 3562772 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 3384633) total of 3384633 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.047825 seconds +[default0]: number of documents: 3562772 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.005311 seconds +[default0]: number of documents: 3562772 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_pt_train_indexmap_406376ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_pt_train_indexmap_406376ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.047 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.026241 seconds +[default0]: number of documents: 2707724 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 2572338) total of 2572338 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.108721 seconds +[default0]: number of documents: 2707724 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.002986 seconds +[default0]: number of documents: 2707724 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_code_train_indexmap_373254ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_code_train_indexmap_373254ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.019 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.044768 seconds +[default0]: number of documents: 5055942 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 4803145) total of 4803145 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.068956 seconds +[default0]: number of documents: 5055942 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003284 seconds +[default0]: number of documents: 5055942 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_fr_train_indexmap_367732ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_fr_train_indexmap_367732ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.046 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.082201 seconds +[default0]: number of documents: 2148955 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 2041507) total of 2041507 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.044895 seconds +[default0]: number of documents: 2148955 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.002643 seconds +[default0]: number of documents: 2148955 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ar_train_indexmap_310091ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ar_train_indexmap_310091ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.015 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.077254 seconds +[default0]: number of documents: 2627392 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 2496022) total of 2496022 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.051920 seconds +[default0]: number of documents: 2627392 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.002636 seconds +[default0]: number of documents: 2627392 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_id_train_indexmap_305386ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_id_train_indexmap_305386ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.034 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.022985 seconds +[default0]: number of documents: 3560556 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 3382528) total of 3382528 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.092226 seconds +[default0]: number of documents: 3560556 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003010 seconds +[default0]: number of documents: 3560556 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_zh_train_indexmap_304299ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_zh_train_indexmap_304299ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.046 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.025755 seconds +[default0]: number of documents: 1543441 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 1466269) total of 1466269 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.059577 seconds +[default0]: number of documents: 1543441 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.002458 seconds +[default0]: number of documents: 1543441 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_hi_train_indexmap_291291ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_hi_train_indexmap_291291ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.022 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.072423 seconds +[default0]: number of documents: 1667306 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 1583941) total of 1583941 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.028321 seconds +[default0]: number of documents: 1667306 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003261 seconds +[default0]: number of documents: 1667306 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_vi_train_indexmap_205616ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_vi_train_indexmap_205616ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.016 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.041934 seconds +[default0]: number of documents: 855756 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 812968) total of 812968 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.049037 seconds +[default0]: number of documents: 855756 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003184 seconds +[default0]: number of documents: 855756 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ur_train_indexmap_127177ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ur_train_indexmap_127177ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.006 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.045713 seconds +[default0]: number of documents: 573364 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 544696) total of 544696 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.036724 seconds +[default0]: number of documents: 573364 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001578 seconds +[default0]: number of documents: 573364 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_te_train_indexmap_88369ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_te_train_indexmap_88369ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.007 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.031742 seconds +[default0]: number of documents: 410633 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 390101) total of 390101 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.042157 seconds +[default0]: number of documents: 410633 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.000763 seconds +[default0]: number of documents: 410633 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ta_train_indexmap_61292ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ta_train_indexmap_61292ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.006 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.040575 seconds +[default0]: number of documents: 428843 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 407401) total of 407401 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.033567 seconds +[default0]: number of documents: 428843 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001210 seconds +[default0]: number of documents: 428843 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_bn_train_indexmap_55208ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_bn_train_indexmap_55208ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.008 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.045464 seconds +[default0]: number of documents: 417269 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 396406) total of 396406 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.048378 seconds +[default0]: number of documents: 417269 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001104 seconds +[default0]: number of documents: 417269 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_mr_train_indexmap_44171ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_mr_train_indexmap_44171ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.027 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.056691 seconds +[default0]: number of documents: 1114455 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 1058732) total of 1058732 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.044890 seconds +[default0]: number of documents: 1114455 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003898 seconds +[default0]: number of documents: 1114455 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_sw_train_indexmap_37186ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_sw_train_indexmap_37186ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.007 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.032947 seconds +[default0]: number of documents: 347499 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 330124) total of 330124 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.042591 seconds +[default0]: number of documents: 347499 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001057 seconds +[default0]: number of documents: 347499 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_gu_train_indexmap_37173ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_gu_train_indexmap_37173ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.005 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.034672 seconds +[default0]: number of documents: 339210 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 322250) total of 322250 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.043507 seconds +[default0]: number of documents: 339210 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.000814 seconds +[default0]: number of documents: 339210 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_pa_train_indexmap_34691ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_pa_train_indexmap_34691ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.007 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.004821 seconds +[default0]: number of documents: 315754 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 299966) total of 299966 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.033476 seconds +[default0]: number of documents: 315754 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.000838 seconds +[default0]: number of documents: 315754 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ne_train_indexmap_26100ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ne_train_indexmap_26100ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.005 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.049356 seconds +[default0]: number of documents: 918416 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 872495) total of 872495 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.045378 seconds +[default0]: number of documents: 918416 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003310 seconds +[default0]: number of documents: 918416 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_yo_train_indexmap_23389ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_yo_train_indexmap_23389ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.018 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.034074 seconds +[default0]: number of documents: 950097 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 902592) total of 902592 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.046501 seconds +[default0]: number of documents: 950097 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003519 seconds +[default0]: number of documents: 950097 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ig_train_indexmap_21563ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ig_train_indexmap_21563ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.007 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.045857 seconds +[default0]: number of documents: 915063 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 869310) total of 869310 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.062270 seconds +[default0]: number of documents: 915063 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003701 seconds +[default0]: number of documents: 915063 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ny_train_indexmap_18042ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ny_train_indexmap_18042ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.006 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.027257 seconds +[default0]: number of documents: 915061 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 869308) total of 869308 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.054893 seconds +[default0]: number of documents: 915061 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003074 seconds +[default0]: number of documents: 915061 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_zu_train_indexmap_17484ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_zu_train_indexmap_17484ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.031 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.049531 seconds +[default0]: number of documents: 915058 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 869305) total of 869305 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.051761 seconds +[default0]: number of documents: 915058 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003501 seconds +[default0]: number of documents: 915058 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_xh_train_indexmap_16885ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_xh_train_indexmap_16885ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.028 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.049850 seconds +[default0]: number of documents: 865056 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 821803) total of 821803 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.044699 seconds +[default0]: number of documents: 865056 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003845 seconds +[default0]: number of documents: 865056 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_sn_train_indexmap_16740ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_sn_train_indexmap_16740ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.036 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.032823 seconds +[default0]: number of documents: 915044 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 869292) total of 869292 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.027116 seconds +[default0]: number of documents: 915044 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003203 seconds +[default0]: number of documents: 915044 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ts_train_indexmap_16592ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ts_train_indexmap_16592ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.017 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.046457 seconds +[default0]: number of documents: 915043 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 869291) total of 869291 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.049816 seconds +[default0]: number of documents: 915043 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003884 seconds +[default0]: number of documents: 915043 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_rw_train_indexmap_16532ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_rw_train_indexmap_16532ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.033 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.041370 seconds +[default0]: number of documents: 915021 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 869270) total of 869270 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.044881 seconds +[default0]: number of documents: 915021 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.002940 seconds +[default0]: number of documents: 915021 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_lg_train_indexmap_15642ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_lg_train_indexmap_15642ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.019 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.049386 seconds +[default0]: number of documents: 915054 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 869301) total of 869301 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.038501 seconds +[default0]: number of documents: 915054 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.002901 seconds +[default0]: number of documents: 915054 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_tn_train_indexmap_15616ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_tn_train_indexmap_15616ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.025 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.052102 seconds +[default0]: number of documents: 915051 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 869298) total of 869298 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.047170 seconds +[default0]: number of documents: 915051 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003726 seconds +[default0]: number of documents: 915051 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_nso_train_indexmap_15230ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_nso_train_indexmap_15230ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.033 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.029375 seconds +[default0]: number of documents: 318189 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 302280) total of 302280 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.036043 seconds +[default0]: number of documents: 318189 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.000888 seconds +[default0]: number of documents: 318189 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_rn_train_indexmap_12795ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_rn_train_indexmap_12795ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.027 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.030486 seconds +[default0]: number of documents: 265864 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 252571) total of 252571 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.031246 seconds +[default0]: number of documents: 265864 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.000619 seconds +[default0]: number of documents: 265864 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ml_train_indexmap_11605ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ml_train_indexmap_11605ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.014 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.031562 seconds +[default0]: number of documents: 265063 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 251810) total of 251810 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.042767 seconds +[default0]: number of documents: 265063 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.000673 seconds +[default0]: number of documents: 265063 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_kn_train_indexmap_10970ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_kn_train_indexmap_10970ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.004 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.033217 seconds +[default0]: number of documents: 265063 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 251810) total of 251810 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.033199 seconds +[default0]: number of documents: 265063 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.000914 seconds +[default0]: number of documents: 265063 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_or_train_indexmap_10706ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_or_train_indexmap_10706ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.003 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.041497 seconds +[default0]: number of documents: 265063 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 251810) total of 251810 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.041488 seconds +[default0]: number of documents: 265063 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.000736 seconds +[default0]: number of documents: 265063 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_as_train_indexmap_10360ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_as_train_indexmap_10360ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.003 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.039413 seconds +[default0]: number of documents: 365060 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 346807) total of 346807 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.004702 seconds +[default0]: number of documents: 365060 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001402 seconds +[default0]: number of documents: 365060 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ln_train_indexmap_8374ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ln_train_indexmap_8374ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.004 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.006280 seconds +[default0]: number of documents: 365063 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 346810) total of 346810 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.026946 seconds +[default0]: number of documents: 365063 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.000811 seconds +[default0]: number of documents: 365063 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_wo_train_indexmap_8126ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_wo_train_indexmap_8126ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.004 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.034316 seconds +[default0]: number of documents: 265063 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 251810) total of 251810 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.004467 seconds +[default0]: number of documents: 265063 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.000674 seconds +[default0]: number of documents: 265063 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_tum_train_indexmap_7693ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_tum_train_indexmap_7693ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.021 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.029501 seconds +[default0]: number of documents: 265180 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 251921) total of 251921 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.036757 seconds +[default0]: number of documents: 265180 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.000597 seconds +[default0]: number of documents: 265180 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ki_train_indexmap_7627ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ki_train_indexmap_7627ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.020 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.019631 seconds +[default0]: number of documents: 265063 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 251810) total of 251810 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.005174 seconds +[default0]: number of documents: 265063 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.000934 seconds +[default0]: number of documents: 265063 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_st_train_indexmap_7564ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_st_train_indexmap_7564ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.016 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.030517 seconds +[default0]: number of documents: 265063 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 251810) total of 251810 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.033190 seconds +[default0]: number of documents: 265063 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.000735 seconds +[default0]: number of documents: 265063 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_fon_train_indexmap_7497ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_fon_train_indexmap_7497ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.004 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.033242 seconds +[default0]: number of documents: 271191 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 257631) total of 257631 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.028849 seconds +[default0]: number of documents: 271191 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.000727 seconds +[default0]: number of documents: 271191 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ca_train_indexmap_7334ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ca_train_indexmap_7334ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.005 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.042306 seconds +[default0]: number of documents: 269973 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 256474) total of 256474 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.062043 seconds +[default0]: number of documents: 269973 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.000684 seconds +[default0]: number of documents: 269973 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_eu_train_indexmap_7168ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_eu_train_indexmap_7168ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.003 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.032964 seconds +[default0]: number of documents: 265071 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 251817) total of 251817 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.031405 seconds +[default0]: number of documents: 265071 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.000985 seconds +[default0]: number of documents: 265071 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ak_train_indexmap_7167ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ak_train_indexmap_7167ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.004 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003760 seconds +[default0]: number of documents: 265180 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 251921) total of 251921 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.032718 seconds +[default0]: number of documents: 265180 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.000593 seconds +[default0]: number of documents: 265180 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_bm_train_indexmap_7098ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_bm_train_indexmap_7098ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.021 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.029382 seconds +[default0]: number of documents: 265071 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 251817) total of 251817 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.030294 seconds +[default0]: number of documents: 265071 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.000624 seconds +[default0]: number of documents: 265071 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_tw_train_indexmap_7047ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_tw_train_indexmap_7047ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.005 seconds +[default0]:> building indices for blendable datasets ... +[default0]: > sample ratios: +[default0]: dataset 0, input: 0.387164, achieved: 0.387164 +[default0]: dataset 1, input: 0.0786754, achieved: 0.0786753 +[default0]: dataset 2, input: 0.0636898, achieved: 0.0636898 +[default0]: dataset 3, input: 0.0584986, achieved: 0.0584986 +[default0]: dataset 4, input: 0.0576332, achieved: 0.0576332 +[default0]: dataset 5, input: 0.0485994, achieved: 0.0485994 +[default0]: dataset 6, input: 0.0478619, achieved: 0.0478619 +[default0]: dataset 7, input: 0.0476917, achieved: 0.0476917 +[default0]: dataset 8, input: 0.045653, achieved: 0.045653 +[default0]: dataset 9, input: 0.0322254, achieved: 0.0322254 +[default0]: dataset 10, input: 0.0199319, achieved: 0.0199319 +[default0]: dataset 11, input: 0.0138497, achieved: 0.0138497 +[default0]: dataset 12, input: 0.00960604, achieved: 0.00960602 +[default0]: dataset 13, input: 0.00865243, achieved: 0.00865244 +[default0]: dataset 14, input: 0.00692261, achieved: 0.00692258 +[default0]: dataset 15, input: 0.00582803, achieved: 0.00582806 +[default0]: dataset 16, input: 0.00582586, achieved: 0.00582586 +[default0]: dataset 17, input: 0.00543684, achieved: 0.00543682 +[default0]: dataset 18, input: 0.00409056, achieved: 0.00409057 +[default0]: dataset 19, input: 0.00366562, achieved: 0.00366564 +[default0]: dataset 20, input: 0.00337934, achieved: 0.00337937 +[default0]: dataset 21, input: 0.00282756, achieved: 0.00282753 +[default0]: dataset 22, input: 0.00274012, achieved: 0.00274012 +[default0]: dataset 23, input: 0.00264619, achieved: 0.00264622 +[default0]: dataset 24, input: 0.00262358, achieved: 0.0026236 +[default0]: dataset 25, input: 0.0026003, achieved: 0.00260032 +[default0]: dataset 26, input: 0.00259099, achieved: 0.00259097 +[default0]: dataset 27, input: 0.00245151, achieved: 0.00245155 +[default0]: dataset 28, input: 0.00244735, achieved: 0.00244736 +[default0]: dataset 29, input: 0.00238684, achieved: 0.00238686 +[default0]: dataset 30, input: 0.0020053, achieved: 0.00200525 +[default0]: dataset 31, input: 0.00181876, achieved: 0.00181879 +[default0]: dataset 32, input: 0.00171918, achieved: 0.00171917 +[default0]: dataset 33, input: 0.00167779, achieved: 0.00167776 +[default0]: dataset 34, input: 0.00162359, achieved: 0.00162355 +[default0]: dataset 35, input: 0.00131237, achieved: 0.00131238 +[default0]: dataset 36, input: 0.00127347, achieved: 0.00127344 +[default0]: dataset 37, input: 0.00120564, achieved: 0.00120569 +[default0]: dataset 38, input: 0.00119533, achieved: 0.00119529 +[default0]: dataset 39, input: 0.00118536, achieved: 0.00118536 +[default0]: dataset 40, input: 0.00117488, achieved: 0.00117487 +[default0]: dataset 41, input: 0.00114928, achieved: 0.00114929 +[default0]: dataset 42, input: 0.00112334, achieved: 0.00112334 +[default0]: dataset 43, input: 0.00112318, achieved: 0.00112315 +[default0]: dataset 44, input: 0.00111237, achieved: 0.00111236 +[default0]: dataset 45, input: 0.00110439, achieved: 0.00110444 +[default0]:> elapsed time for building blendable dataset indices: 0.50 (sec) +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.153039 seconds +[default0]: number of documents: 15234080 +[default0]: > dataset split: +[default0]: validation_pretraining: +[default0]: document indices in [14472376, 15234080) total of 761704 documents +[default0]: > loading doc-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/ar/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_ar_text_document_validation_pretraining_indexmap_17016ns_2048sl_42s_doc_idx.npy +[default0]: > loading sample-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/ar/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_ar_text_document_validation_pretraining_indexmap_17016ns_2048sl_42s_sample_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/ar/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_ar_text_document_validation_pretraining_indexmap_17016ns_2048sl_42s_shuffle_idx.npy +[default0]: loaded indexed file in 0.077 seconds +[default0]: total number of samples: 221750 +[default0]: total number of epochs: 1 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.100196 seconds +[default0]: number of documents: 6142390 +[default0]: > dataset split: +[default0]: validation_pretraining: +[default0]: document indices in [5835270, 6142390) total of 307120 documents +[default0]: > loading doc-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/ca/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_ca_text_document_validation_pretraining_indexmap_5785ns_2048sl_42s_doc_idx.npy +[default0]: > loading sample-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/ca/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_ca_text_document_validation_pretraining_indexmap_5785ns_2048sl_42s_sample_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/ca/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_ca_text_document_validation_pretraining_indexmap_5785ns_2048sl_42s_shuffle_idx.npy +[default0]: loaded indexed file in 0.078 seconds +[default0]: total number of samples: 136143 +[default0]: total number of epochs: 1 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.149825 seconds +[default0]: number of documents: 26176998 +[default0]: > dataset split: +[default0]: validation_pretraining: +[default0]: document indices in [24868148, 26176998) total of 1308850 documents +[default0]: > loading doc-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/code/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_code_text_document_validation_pretraining_indexmap_67033ns_2048sl_42s_doc_idx.npy +[default0]: > loading sample-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/code/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_code_text_document_validation_pretraining_indexmap_67033ns_2048sl_42s_sample_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/code/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_code_text_document_validation_pretraining_indexmap_67033ns_2048sl_42s_shuffle_idx.npy +[default0]: loaded indexed file in 0.103 seconds +[default0]: total number of samples: 432311 +[default0]: total number of epochs: 1 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.088376 seconds +[default0]: number of documents: 20844665 +[default0]: > dataset split: +[default0]: validation_pretraining: +[default0]: document indices in [19802432, 20844665) total of 1042233 documents +[default0]: > loading doc-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/en/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_en_text_document_validation_pretraining_indexmap_114084ns_2048sl_42s_doc_idx.npy +[default0]: > loading sample-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/en/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_en_text_document_validation_pretraining_indexmap_114084ns_2048sl_42s_sample_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/en/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_en_text_document_validation_pretraining_indexmap_114084ns_2048sl_42s_shuffle_idx.npy +[default0]: loaded indexed file in 0.092 seconds +[default0]: total number of samples: 521545 +[default0]: total number of epochs: 1 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.154916 seconds +[default0]: number of documents: 67005817 +[default0]: > dataset split: +[default0]: validation_pretraining: +[default0]: document indices in [63655526, 67005817) total of 3350291 documents +[default0]: > loading doc-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/es/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_es_text_document_validation_pretraining_indexmap_54893ns_2048sl_42s_doc_idx.npy +[default0]: > loading sample-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/es/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_es_text_document_validation_pretraining_indexmap_54893ns_2048sl_42s_sample_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/es/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_es_text_document_validation_pretraining_indexmap_54893ns_2048sl_42s_shuffle_idx.npy +[default0]: loaded indexed file in 0.129 seconds +[default0]: total number of samples: 1740321 +[default0]: total number of epochs: 1 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.083801 seconds +[default0]: number of documents: 5149795 +[default0]: > dataset split: +[default0]: validation_pretraining: +[default0]: document indices in [4892305, 5149795) total of 257490 documents +[default0]: > loading doc-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/eu/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_eu_text_document_validation_pretraining_indexmap_803ns_2048sl_42s_doc_idx.npy +[default0]: > loading sample-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/eu/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_eu_text_document_validation_pretraining_indexmap_803ns_2048sl_42s_sample_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/eu/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_eu_text_document_validation_pretraining_indexmap_803ns_2048sl_42s_shuffle_idx.npy +[default0]: loaded indexed file in 0.044 seconds +[default0]: total number of samples: 26370 +[default0]: total number of epochs: 1 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.057925 seconds +[default0]: number of documents: 58847091 +[default0]: > dataset split: +[default0]: validation_pretraining: +[default0]: document indices in [55904736, 58847091) total of 2942355 documents +[default0]: > loading doc-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/fr/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_fr_text_document_validation_pretraining_indexmap_67171ns_2048sl_42s_doc_idx.npy +[default0]: > loading sample-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/fr/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_fr_text_document_validation_pretraining_indexmap_67171ns_2048sl_42s_sample_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/fr/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_fr_text_document_validation_pretraining_indexmap_67171ns_2048sl_42s_shuffle_idx.npy +[default0]: loaded indexed file in 0.126 seconds +[default0]: total number of samples: 1458654 +[default0]: total number of epochs: 1 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.084545 seconds +[default0]: number of documents: 12514253 +[default0]: > dataset split: +[default0]: validation_pretraining: +[default0]: document indices in [11888540, 12514253) total of 625713 documents +[default0]: > loading doc-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/id/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_id_text_document_validation_pretraining_indexmap_5618ns_2048sl_42s_doc_idx.npy +[default0]: > loading sample-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/id/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_id_text_document_validation_pretraining_indexmap_5618ns_2048sl_42s_sample_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/id/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_id_text_document_validation_pretraining_indexmap_5618ns_2048sl_42s_shuffle_idx.npy +[default0]: loaded indexed file in 0.064 seconds +[default0]: total number of samples: 134071 +[default0]: total number of epochs: 1 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.054989 seconds +[default0]: number of documents: 180608 +[default0]: > dataset split: +[default0]: validation_pretraining: +[default0]: document indices in [171578, 180608) total of 9030 documents +[default0]: > loading doc-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-as/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-as_text_document_validation_pretraining_indexmap_57ns_2048sl_42s_doc_idx.npy +[default0]: > loading sample-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-as/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-as_text_document_validation_pretraining_indexmap_57ns_2048sl_42s_sample_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-as/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-as_text_document_validation_pretraining_indexmap_57ns_2048sl_42s_shuffle_idx.npy +[default0]: loaded indexed file in 0.006 seconds +[default0]: total number of samples: 2501 +[default0]: total number of epochs: 1 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.069281 seconds +[default0]: number of documents: 12303134 +[default0]: > dataset split: +[default0]: validation_pretraining: +[default0]: document indices in [11687977, 12303134) total of 615157 documents +[default0]: > loading doc-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-bn/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-bn_text_document_validation_pretraining_indexmap_2827ns_2048sl_42s_doc_idx.npy +[default0]: > loading sample-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-bn/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-bn_text_document_validation_pretraining_indexmap_2827ns_2048sl_42s_sample_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-bn/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-bn_text_document_validation_pretraining_indexmap_2827ns_2048sl_42s_shuffle_idx.npy +[default0]: loaded indexed file in 0.046 seconds +[default0]: total number of samples: 157244 +[default0]: total number of epochs: 1 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.075806 seconds +[default0]: number of documents: 2033057 +[default0]: > dataset split: +[default0]: validation_pretraining: +[default0]: document indices in [1931404, 2033057) total of 101653 documents +[default0]: > loading doc-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-gu/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-gu_text_document_validation_pretraining_indexmap_207ns_2048sl_42s_doc_idx.npy +[default0]: > loading sample-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-gu/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-gu_text_document_validation_pretraining_indexmap_207ns_2048sl_42s_sample_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-gu/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-gu_text_document_validation_pretraining_indexmap_207ns_2048sl_42s_shuffle_idx.npy +[default0]: loaded indexed file in 0.037 seconds +[default0]: total number of samples: 20517 +[default0]: total number of epochs: 1 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.066553 seconds +[default0]: number of documents: 26793553 +[default0]: > dataset split: +[default0]: validation_pretraining: +[default0]: document indices in [25453875, 26793553) total of 1339678 documents +[default0]: > loading doc-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-hi/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-hi_text_document_validation_pretraining_indexmap_3844ns_2048sl_42s_doc_idx.npy +[default0]: > loading sample-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-hi/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-hi_text_document_validation_pretraining_indexmap_3844ns_2048sl_42s_sample_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-hi/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-hi_text_document_validation_pretraining_indexmap_3844ns_2048sl_42s_shuffle_idx.npy +[default0]: loaded indexed file in 0.067 seconds +[default0]: total number of samples: 101502 +[default0]: total number of epochs: 1 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.083435 seconds +[default0]: number of documents: 3155990 +[default0]: > dataset split: +[default0]: validation_pretraining: +[default0]: document indices in [2998190, 3155990) total of 157800 documents +[default0]: > loading doc-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-kn/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-kn_text_document_validation_pretraining_indexmap_319ns_2048sl_42s_doc_idx.npy +[default0]: > loading sample-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-kn/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-kn_text_document_validation_pretraining_indexmap_319ns_2048sl_42s_sample_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-kn/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-kn_text_document_validation_pretraining_indexmap_319ns_2048sl_42s_shuffle_idx.npy +[default0]: loaded indexed file in 0.033 seconds +[default0]: total number of samples: 44182 +[default0]: total number of epochs: 1 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.102091 seconds +[default0]: number of documents: 6692522 +[default0]: > dataset split: +[default0]: validation_pretraining: +[default0]: document indices in [6357896, 6692522) total of 334626 documents +[default0]: > loading doc-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-ml/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-ml_text_document_validation_pretraining_indexmap_532ns_2048sl_42s_doc_idx.npy +[default0]: > loading sample-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-ml/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-ml_text_document_validation_pretraining_indexmap_532ns_2048sl_42s_sample_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-ml/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-ml_text_document_validation_pretraining_indexmap_532ns_2048sl_42s_shuffle_idx.npy +[default0]: loaded indexed file in 0.030 seconds +[default0]: total number of samples: 47613 +[default0]: total number of epochs: 1 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.107630 seconds +[default0]: number of documents: 3017261 +[default0]: > dataset split: +[default0]: validation_pretraining: +[default0]: document indices in [2866398, 3017261) total of 150863 documents +[default0]: > loading doc-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-mr/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-mr_text_document_validation_pretraining_indexmap_258ns_2048sl_42s_doc_idx.npy +[default0]: > loading sample-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-mr/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-mr_text_document_validation_pretraining_indexmap_258ns_2048sl_42s_sample_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-mr/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-mr_text_document_validation_pretraining_indexmap_258ns_2048sl_42s_shuffle_idx.npy +[default0]: loaded indexed file in 0.027 seconds +[default0]: total number of samples: 29298 +[default0]: total number of epochs: 1 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.084139 seconds +[default0]: number of documents: 3648041 +[default0]: > dataset split: +[default0]: validation_pretraining: +[default0]: document indices in [3465639, 3648041) total of 182402 documents +[default0]: > loading doc-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-ne/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-ne_text_document_validation_pretraining_indexmap_344ns_2048sl_42s_doc_idx.npy +[default0]: > loading sample-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-ne/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-ne_text_document_validation_pretraining_indexmap_344ns_2048sl_42s_sample_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-ne/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-ne_text_document_validation_pretraining_indexmap_344ns_2048sl_42s_shuffle_idx.npy +[default0]: loaded indexed file in 0.006 seconds +[default0]: total number of samples: 5659 +[default0]: total number of epochs: 1 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.109308 seconds +[default0]: number of documents: 4327282 +[default0]: > dataset split: +[default0]: validation_pretraining: +[default0]: document indices in [4110918, 4327282) total of 216364 documents +[default0]: > loading doc-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-or/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-or_text_document_validation_pretraining_indexmap_185ns_2048sl_42s_doc_idx.npy +[default0]: > loading sample-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-or/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-or_text_document_validation_pretraining_indexmap_185ns_2048sl_42s_sample_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-or/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-or_text_document_validation_pretraining_indexmap_185ns_2048sl_42s_shuffle_idx.npy +[default0]: loaded indexed file in 0.023 seconds +[default0]: total number of samples: 12423 +[default0]: total number of epochs: 1 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.112855 seconds +[default0]: number of documents: 2698896 +[default0]: > dataset split: +[default0]: validation_pretraining: +[default0]: document indices in [2563951, 2698896) total of 134945 documents +[default0]: > loading doc-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-pa/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-pa_text_document_validation_pretraining_indexmap_262ns_2048sl_42s_doc_idx.npy +[default0]: > loading sample-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-pa/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-pa_text_document_validation_pretraining_indexmap_262ns_2048sl_42s_sample_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-pa/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-pa_text_document_validation_pretraining_indexmap_262ns_2048sl_42s_shuffle_idx.npy +[default0]: loaded indexed file in 0.044 seconds +[default0]: total number of samples: 19133 +[default0]: total number of epochs: 1 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.128164 seconds +[default0]: number of documents: 12767593 +[default0]: > dataset split: +[default0]: validation_pretraining: +[default0]: document indices in [12129213, 12767593) total of 638380 documents +[default0]: > loading doc-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-ta/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-ta_text_document_validation_pretraining_indexmap_1088ns_2048sl_42s_doc_idx.npy +[default0]: > loading sample-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-ta/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-ta_text_document_validation_pretraining_indexmap_1088ns_2048sl_42s_sample_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-ta/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-ta_text_document_validation_pretraining_indexmap_1088ns_2048sl_42s_shuffle_idx.npy +[default0]: loaded indexed file in 0.069 seconds +[default0]: total number of samples: 87928 +[default0]: total number of epochs: 1 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.102381 seconds +[default0]: number of documents: 4342323 +[default0]: > dataset split: +[default0]: validation_pretraining: +[default0]: document indices in [4125207, 4342323) total of 217116 documents +[default0]: > loading doc-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-te/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-te_text_document_validation_pretraining_indexmap_470ns_2048sl_42s_doc_idx.npy +[default0]: > loading sample-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-te/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-te_text_document_validation_pretraining_indexmap_470ns_2048sl_42s_sample_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-te/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-te_text_document_validation_pretraining_indexmap_470ns_2048sl_42s_shuffle_idx.npy +[default0]: loaded indexed file in 0.048 seconds +[default0]: total number of samples: 69780 +[default0]: total number of epochs: 1 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.077112 seconds +[default0]: number of documents: 3022722 +[default0]: > dataset split: +[default0]: validation_pretraining: +[default0]: document indices in [2871586, 3022722) total of 151136 documents +[default0]: > loading doc-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-ur/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-ur_text_document_validation_pretraining_indexmap_641ns_2048sl_42s_doc_idx.npy +[default0]: > loading sample-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-ur/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-ur_text_document_validation_pretraining_indexmap_641ns_2048sl_42s_sample_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-ur/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-ur_text_document_validation_pretraining_indexmap_641ns_2048sl_42s_shuffle_idx.npy +[default0]: loaded indexed file in 0.006 seconds +[default0]: total number of samples: 22532 +[default0]: total number of epochs: 1 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.079508 seconds +[default0]: number of documents: 1162568 +[default0]: > dataset split: +[default0]: validation_pretraining: +[default0]: document indices in [1104440, 1162568) total of 58128 documents +[default0]: > loading doc-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/nigercongo-all/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_nigercongo-all_text_document_validation_pretraining_indexmap_163ns_2048sl_42s_doc_idx.npy +[default0]: > loading sample-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/nigercongo-all/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_nigercongo-all_text_document_validation_pretraining_indexmap_163ns_2048sl_42s_sample_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/nigercongo-all/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_nigercongo-all_text_document_validation_pretraining_indexmap_163ns_2048sl_42s_shuffle_idx.npy +[default0]: loaded indexed file in 0.004 seconds +[default0]: total number of samples: 1608 +[default0]: total number of epochs: 1 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.053068 seconds +[default0]: number of documents: 55294645 +[default0]: > dataset split: +[default0]: validation_pretraining: +[default0]: document indices in [52529913, 55294645) total of 2764732 documents +[default0]: > loading doc-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/oscar-en/meg_ds_bigscience_tokenizer_text_document_validation_pretraining_indexmap_41871ns_2048sl_42s_doc_idx.npy +[default0]: > loading sample-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/oscar-en/meg_ds_bigscience_tokenizer_text_document_validation_pretraining_indexmap_41871ns_2048sl_42s_sample_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/oscar-en/meg_ds_bigscience_tokenizer_text_document_validation_pretraining_indexmap_41871ns_2048sl_42s_shuffle_idx.npy +[default0]: loaded indexed file in 0.112 seconds +[default0]: total number of samples: 690621 +[default0]: total number of epochs: 1 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.055253 seconds +[default0]: number of documents: 44855616 +[default0]: > dataset split: +[default0]: validation_pretraining: +[default0]: document indices in [42612835, 44855616) total of 2242781 documents +[default0]: > loading doc-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/oscar-zh/meg_ds_bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_text_document_validation_pretraining_indexmap_28453ns_2048sl_42s_doc_idx.npy +[default0]: > loading sample-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/oscar-zh/meg_ds_bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_text_document_validation_pretraining_indexmap_28453ns_2048sl_42s_sample_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/oscar-zh/meg_ds_bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_text_document_validation_pretraining_indexmap_28453ns_2048sl_42s_shuffle_idx.npy +[default0]: loaded indexed file in 0.093 seconds +[default0]: total number of samples: 468689 +[default0]: total number of epochs: 1 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.092597 seconds +[default0]: number of documents: 31969891 +[default0]: > dataset split: +[default0]: validation_pretraining: +[default0]: document indices in [30371396, 31969891) total of 1598495 documents +[default0]: > loading doc-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/pt/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_pt_text_document_validation_pretraining_indexmap_25493ns_2048sl_42s_doc_idx.npy +[default0]: > loading sample-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/pt/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_pt_text_document_validation_pretraining_indexmap_25493ns_2048sl_42s_sample_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/pt/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_pt_text_document_validation_pretraining_indexmap_25493ns_2048sl_42s_shuffle_idx.npy +[default0]: loaded indexed file in 0.086 seconds +[default0]: total number of samples: 497625 +[default0]: total number of epochs: 1 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.075733 seconds +[default0]: number of documents: 34110375 +[default0]: > dataset split: +[default0]: validation_pretraining: +[default0]: document indices in [32404856, 34110375) total of 1705519 documents +[default0]: > loading doc-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/vi/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_vi_text_document_validation_pretraining_indexmap_12667ns_2048sl_42s_doc_idx.npy +[default0]: > loading sample-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/vi/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_vi_text_document_validation_pretraining_indexmap_12667ns_2048sl_42s_sample_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/vi/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_vi_text_document_validation_pretraining_indexmap_12667ns_2048sl_42s_shuffle_idx.npy +[default0]: loaded indexed file in 0.067 seconds +[default0]: total number of samples: 125120 +[default0]: total number of epochs: 1 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.175030 seconds +[default0]: number of documents: 43761623 +[default0]: > dataset split: +[default0]: validation_pretraining: +[default0]: document indices in [41573542, 43761623) total of 2188081 documents +[default0]: > loading doc-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/zhs/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_zhs_text_document_validation_pretraining_indexmap_62220ns_2048sl_42s_doc_idx.npy +[default0]: > loading sample-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/zhs/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_zhs_text_document_validation_pretraining_indexmap_62220ns_2048sl_42s_sample_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/zhs/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_zhs_text_document_validation_pretraining_indexmap_62220ns_2048sl_42s_shuffle_idx.npy +[default0]: loaded indexed file in 0.106 seconds +[default0]: total number of samples: 1010592 +[default0]: total number of epochs: 1 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.045993 seconds +[default0]: number of documents: 197602 +[default0]: > dataset split: +[default0]: validation_pretraining: +[default0]: document indices in [187722, 197602) total of 9880 documents +[default0]: > loading doc-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/zht/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_zht_text_document_validation_pretraining_indexmap_267ns_2048sl_42s_doc_idx.npy +[default0]: > loading sample-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/zht/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_zht_text_document_validation_pretraining_indexmap_267ns_2048sl_42s_sample_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/zht/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_zht_text_document_validation_pretraining_indexmap_267ns_2048sl_42s_shuffle_idx.npy +[default0]: loaded indexed file in 0.020 seconds +[default0]: total number of samples: 4451 +[default0]: total number of epochs: 1 +[default0]:> building indices for blendable datasets ... +[default0]: > sample ratios: +[default0]: dataset 0, input: 0.0330676, achieved: 0.0330676 +[default0]: dataset 1, input: 0.0112421, achieved: 0.0112421 +[default0]: dataset 2, input: 0.130272, achieved: 0.130272 +[default0]: dataset 3, input: 0.221712, achieved: 0.221712 +[default0]: dataset 4, input: 0.106678, achieved: 0.106678 +[default0]: dataset 5, input: 0.00155951, achieved: 0.00155955 +[default0]: dataset 6, input: 0.13054, achieved: 0.13054 +[default0]: dataset 7, input: 0.010918, achieved: 0.0109181 +[default0]: dataset 8, input: 0.000110214, achieved: 0.000110257 +[default0]: dataset 9, input: 0.00549238, achieved: 0.00549235 +[default0]: dataset 10, input: 0.000402122, achieved: 0.000402094 +[default0]: dataset 11, input: 0.00747007, achieved: 0.00747007 +[default0]: dataset 12, input: 0.000619047, achieved: 0.000619024 +[default0]: dataset 13, input: 0.00103353, achieved: 0.0010336 +[default0]: dataset 14, input: 0.000501201, achieved: 0.000501226 +[default0]: dataset 15, input: 0.000667277, achieved: 0.000667231 +[default0]: dataset 16, input: 0.000359281, achieved: 0.000359326 +[default0]: dataset 17, input: 0.000508443, achieved: 0.000508519 +[default0]: dataset 18, input: 0.00211373, achieved: 0.0021138 +[default0]: dataset 19, input: 0.000912995, achieved: 0.000912961 +[default0]: dataset 20, input: 0.00124543, achieved: 0.00124546 +[default0]: dataset 21, input: 0.000315887, achieved: 0.00031594 +[default0]: dataset 22, input: 0.0813721, achieved: 0.0813721 +[default0]: dataset 23, input: 0.0552939, achieved: 0.0552939 +[default0]: dataset 24, input: 0.0495415, achieved: 0.0495414 +[default0]: dataset 25, input: 0.0246164, achieved: 0.0246163 +[default0]: dataset 26, input: 0.120917, achieved: 0.120917 +[default0]: dataset 27, input: 0.000517703, achieved: 0.000517666 +[default0]:> elapsed time for building blendable dataset indices: 0.31 (sec) +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.008301 seconds +[default0]: number of documents: 31495184 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [29920425, 31495184) total of 1574759 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.006888 seconds +[default0]: number of documents: 31495184 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003293 seconds +[default0]: number of documents: 31495184 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_en_validation_indexmap_199220ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_en_validation_indexmap_199220ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.101 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.008136 seconds +[default0]: number of documents: 5151349 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [4893782, 5151349) total of 257567 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.008217 seconds +[default0]: number of documents: 5151349 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.002258 seconds +[default0]: number of documents: 5151349 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_es_validation_indexmap_40484ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_es_validation_indexmap_40484ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.052 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.009724 seconds +[default0]: number of documents: 3562772 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [3384633, 3562772) total of 178139 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.006757 seconds +[default0]: number of documents: 3562772 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.005374 seconds +[default0]: number of documents: 3562772 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_pt_validation_indexmap_32773ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_pt_validation_indexmap_32773ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.044 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.009887 seconds +[default0]: number of documents: 2707724 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [2572338, 2707724) total of 135386 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.010032 seconds +[default0]: number of documents: 2707724 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003319 seconds +[default0]: number of documents: 2707724 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_code_validation_indexmap_30102ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_code_validation_indexmap_30102ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.035 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.009798 seconds +[default0]: number of documents: 5055942 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [4803145, 5055942) total of 252797 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.009136 seconds +[default0]: number of documents: 5055942 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003278 seconds +[default0]: number of documents: 5055942 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_fr_validation_indexmap_29656ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_fr_validation_indexmap_29656ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.045 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.008893 seconds +[default0]: number of documents: 2148955 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [2041507, 2148955) total of 107448 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.008745 seconds +[default0]: number of documents: 2148955 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.004058 seconds +[default0]: number of documents: 2148955 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ar_validation_indexmap_25008ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ar_validation_indexmap_25008ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.044 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.008227 seconds +[default0]: number of documents: 2627392 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [2496022, 2627392) total of 131370 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.008308 seconds +[default0]: number of documents: 2627392 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.004132 seconds +[default0]: number of documents: 2627392 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_id_validation_indexmap_24628ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_id_validation_indexmap_24628ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.049 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.009530 seconds +[default0]: number of documents: 3560556 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [3382528, 3560556) total of 178028 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.013267 seconds +[default0]: number of documents: 3560556 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003615 seconds +[default0]: number of documents: 3560556 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_zh_validation_indexmap_24541ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_zh_validation_indexmap_24541ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.034 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.012665 seconds +[default0]: number of documents: 1543441 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [1466269, 1543441) total of 77172 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.010158 seconds +[default0]: number of documents: 1543441 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003026 seconds +[default0]: number of documents: 1543441 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_hi_validation_indexmap_23492ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_hi_validation_indexmap_23492ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.027 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.011760 seconds +[default0]: number of documents: 1667306 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [1583941, 1667306) total of 83365 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.008634 seconds +[default0]: number of documents: 1667306 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003124 seconds +[default0]: number of documents: 1667306 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_vi_validation_indexmap_16582ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_vi_validation_indexmap_16582ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.023 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.012120 seconds +[default0]: number of documents: 855756 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [812968, 855756) total of 42788 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.007599 seconds +[default0]: number of documents: 855756 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003999 seconds +[default0]: number of documents: 855756 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ur_validation_indexmap_10257ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ur_validation_indexmap_10257ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.006 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.006287 seconds +[default0]: number of documents: 573364 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [544696, 573364) total of 28668 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.006567 seconds +[default0]: number of documents: 573364 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.002229 seconds +[default0]: number of documents: 573364 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_te_validation_indexmap_7127ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_te_validation_indexmap_7127ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.040 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.005771 seconds +[default0]: number of documents: 410633 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [390101, 410633) total of 20532 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.004426 seconds +[default0]: number of documents: 410633 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001227 seconds +[default0]: number of documents: 410633 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ta_validation_indexmap_4943ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ta_validation_indexmap_4943ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.008 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.004704 seconds +[default0]: number of documents: 428843 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [407401, 428843) total of 21442 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.004074 seconds +[default0]: number of documents: 428843 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001407 seconds +[default0]: number of documents: 428843 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_bn_validation_indexmap_4453ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_bn_validation_indexmap_4453ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.021 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.018728 seconds +[default0]: number of documents: 417269 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [396406, 417269) total of 20863 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003176 seconds +[default0]: number of documents: 417269 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.000769 seconds +[default0]: number of documents: 417269 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_mr_validation_indexmap_3563ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_mr_validation_indexmap_3563ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.026 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.007480 seconds +[default0]: number of documents: 1114455 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [1058732, 1114455) total of 55723 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.008440 seconds +[default0]: number of documents: 1114455 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003124 seconds +[default0]: number of documents: 1114455 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_sw_validation_indexmap_2999ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_sw_validation_indexmap_2999ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.022 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.004479 seconds +[default0]: number of documents: 347499 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [330124, 347499) total of 17375 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.005104 seconds +[default0]: number of documents: 347499 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001577 seconds +[default0]: number of documents: 347499 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_gu_validation_indexmap_2998ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_gu_validation_indexmap_2998ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.020 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.004560 seconds +[default0]: number of documents: 339210 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [322250, 339210) total of 16960 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003407 seconds +[default0]: number of documents: 339210 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.000706 seconds +[default0]: number of documents: 339210 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_pa_validation_indexmap_2798ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_pa_validation_indexmap_2798ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.026 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.004521 seconds +[default0]: number of documents: 315754 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [299966, 315754) total of 15788 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.002857 seconds +[default0]: number of documents: 315754 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.000896 seconds +[default0]: number of documents: 315754 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ne_validation_indexmap_2105ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ne_validation_indexmap_2105ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.021 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.009541 seconds +[default0]: number of documents: 918416 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [872495, 918416) total of 45921 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.006838 seconds +[default0]: number of documents: 918416 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.002822 seconds +[default0]: number of documents: 918416 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_yo_validation_indexmap_1887ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_yo_validation_indexmap_1887ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.006 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.008019 seconds +[default0]: number of documents: 950097 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [902592, 950097) total of 47505 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.008659 seconds +[default0]: number of documents: 950097 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003122 seconds +[default0]: number of documents: 950097 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ig_validation_indexmap_1739ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ig_validation_indexmap_1739ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.022 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.010786 seconds +[default0]: number of documents: 915063 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [869310, 915063) total of 45753 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.009455 seconds +[default0]: number of documents: 915063 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003745 seconds +[default0]: number of documents: 915063 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ny_validation_indexmap_1455ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ny_validation_indexmap_1455ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.004 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.011118 seconds +[default0]: number of documents: 915061 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [869308, 915061) total of 45753 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.007241 seconds +[default0]: number of documents: 915061 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.004884 seconds +[default0]: number of documents: 915061 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_zu_validation_indexmap_1410ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_zu_validation_indexmap_1410ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.004 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.009043 seconds +[default0]: number of documents: 915058 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [869305, 915058) total of 45753 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.011950 seconds +[default0]: number of documents: 915058 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003465 seconds +[default0]: number of documents: 915058 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_xh_validation_indexmap_1362ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_xh_validation_indexmap_1362ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.004 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.008166 seconds +[default0]: number of documents: 865056 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [821803, 865056) total of 43253 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.008909 seconds +[default0]: number of documents: 865056 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003348 seconds +[default0]: number of documents: 865056 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_sn_validation_indexmap_1350ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_sn_validation_indexmap_1350ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.019 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.011426 seconds +[default0]: number of documents: 915044 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [869292, 915044) total of 45752 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.010719 seconds +[default0]: number of documents: 915044 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003301 seconds +[default0]: number of documents: 915044 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ts_validation_indexmap_1339ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ts_validation_indexmap_1339ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.014 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.012645 seconds +[default0]: number of documents: 915043 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [869291, 915043) total of 45752 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.010202 seconds +[default0]: number of documents: 915043 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003013 seconds +[default0]: number of documents: 915043 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_rw_validation_indexmap_1334ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_rw_validation_indexmap_1334ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.004 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.008580 seconds +[default0]: number of documents: 915021 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [869270, 915021) total of 45751 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.008402 seconds +[default0]: number of documents: 915021 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003291 seconds +[default0]: number of documents: 915021 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_lg_validation_indexmap_1262ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_lg_validation_indexmap_1262ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.019 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.013594 seconds +[default0]: number of documents: 915054 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [869301, 915054) total of 45753 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.007850 seconds +[default0]: number of documents: 915054 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.002981 seconds +[default0]: number of documents: 915054 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_tn_validation_indexmap_1260ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_tn_validation_indexmap_1260ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.012 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.013000 seconds +[default0]: number of documents: 915051 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [869298, 915051) total of 45753 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.007135 seconds +[default0]: number of documents: 915051 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003447 seconds +[default0]: number of documents: 915051 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_nso_validation_indexmap_1229ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_nso_validation_indexmap_1229ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.018 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003486 seconds +[default0]: number of documents: 318189 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [302280, 318189) total of 15909 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.002712 seconds +[default0]: number of documents: 318189 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001008 seconds +[default0]: number of documents: 318189 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_rn_validation_indexmap_1032ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_rn_validation_indexmap_1032ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.005 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.004120 seconds +[default0]: number of documents: 265864 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [252571, 265864) total of 13293 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.004086 seconds +[default0]: number of documents: 265864 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.000889 seconds +[default0]: number of documents: 265864 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ml_validation_indexmap_936ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ml_validation_indexmap_936ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.033 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.038677 seconds +[default0]: number of documents: 265063 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [251810, 265063) total of 13253 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.002248 seconds +[default0]: number of documents: 265063 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.000989 seconds +[default0]: number of documents: 265063 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_kn_validation_indexmap_885ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_kn_validation_indexmap_885ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.020 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.002538 seconds +[default0]: number of documents: 265063 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [251810, 265063) total of 13253 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003335 seconds +[default0]: number of documents: 265063 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.000728 seconds +[default0]: number of documents: 265063 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_or_validation_indexmap_864ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_or_validation_indexmap_864ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.003 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003086 seconds +[default0]: number of documents: 265063 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [251810, 265063) total of 13253 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.002851 seconds +[default0]: number of documents: 265063 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.000598 seconds +[default0]: number of documents: 265063 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_as_validation_indexmap_836ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_as_validation_indexmap_836ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.003 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.004893 seconds +[default0]: number of documents: 365060 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [346807, 365060) total of 18253 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.004229 seconds +[default0]: number of documents: 365060 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001338 seconds +[default0]: number of documents: 365060 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ln_validation_indexmap_676ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ln_validation_indexmap_676ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.003 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.005259 seconds +[default0]: number of documents: 365063 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [346810, 365063) total of 18253 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.004490 seconds +[default0]: number of documents: 365063 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001639 seconds +[default0]: number of documents: 365063 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_wo_validation_indexmap_656ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_wo_validation_indexmap_656ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.003 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003238 seconds +[default0]: number of documents: 265063 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [251810, 265063) total of 13253 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.002467 seconds +[default0]: number of documents: 265063 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.000599 seconds +[default0]: number of documents: 265063 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_tum_validation_indexmap_621ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_tum_validation_indexmap_621ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.018 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.004251 seconds +[default0]: number of documents: 265180 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [251921, 265180) total of 13259 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.002755 seconds +[default0]: number of documents: 265180 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.000861 seconds +[default0]: number of documents: 265180 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ki_validation_indexmap_616ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ki_validation_indexmap_616ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.003 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003515 seconds +[default0]: number of documents: 265063 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [251810, 265063) total of 13253 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.002458 seconds +[default0]: number of documents: 265063 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.000532 seconds +[default0]: number of documents: 265063 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_st_validation_indexmap_610ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_st_validation_indexmap_610ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.003 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003999 seconds +[default0]: number of documents: 265063 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [251810, 265063) total of 13253 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003338 seconds +[default0]: number of documents: 265063 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.000866 seconds +[default0]: number of documents: 265063 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_fon_validation_indexmap_605ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_fon_validation_indexmap_605ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.006 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003141 seconds +[default0]: number of documents: 271191 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [257631, 271191) total of 13560 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.002540 seconds +[default0]: number of documents: 271191 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.000728 seconds +[default0]: number of documents: 271191 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ca_validation_indexmap_592ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ca_validation_indexmap_592ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.003 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.002208 seconds +[default0]: number of documents: 269973 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [256474, 269973) total of 13499 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.012447 seconds +[default0]: number of documents: 269973 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.000547 seconds +[default0]: number of documents: 269973 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_eu_validation_indexmap_579ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_eu_validation_indexmap_579ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.006 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.013063 seconds +[default0]: number of documents: 265071 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [251817, 265071) total of 13254 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.002340 seconds +[default0]: number of documents: 265071 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.000744 seconds +[default0]: number of documents: 265071 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ak_validation_indexmap_578ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ak_validation_indexmap_578ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.038 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.002689 seconds +[default0]: number of documents: 265180 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [251921, 265180) total of 13259 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.002606 seconds +[default0]: number of documents: 265180 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.000811 seconds +[default0]: number of documents: 265180 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_bm_validation_indexmap_573ns_42s_decoder_packed_batch_idx.npy +[default4]:GOTCONSUMEDSAMPLES 0 0 +[default4]:GOTCONSUMEDSAMPLES 0 0 +[default6]:GOTCONSUMEDSAMPLES 0 0 +[default2]:GOTCONSUMEDSAMPLES 0 0 +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_bm_validation_indexmap_573ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.033 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.006379 seconds +[default0]: number of documents: 265071 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [251817, 265071) total of 13254 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.008574 seconds +[default0]: number of documents: 265071 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.000526 seconds +[default0]: number of documents: 265071 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_tw_validation_indexmap_569ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_tw_validation_indexmap_569ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.006 seconds +[default0]:> building indices for blendable datasets ... +[default0]: > sample ratios: +[default0]: dataset 0, input: 0.387164, achieved: 0.387163 +[default0]: dataset 1, input: 0.0786754, achieved: 0.0786745 +[default0]: dataset 2, input: 0.0636898, achieved: 0.0636904 +[default0]: dataset 3, input: 0.0584986, achieved: 0.0584976 +[default0]: dataset 4, input: 0.0576332, achieved: 0.0576328 +[default0]: dataset 5, input: 0.0485994, achieved: 0.0485991 +[default0]: dataset 6, input: 0.0478619, achieved: 0.0478619 +[default0]: dataset 7, input: 0.0476917, achieved: 0.0476916 +[default0]: dataset 8, input: 0.045653, achieved: 0.0456537 +[default0]: dataset 9, input: 0.0322254, achieved: 0.0322257 +[default0]: dataset 10, input: 0.0199319, achieved: 0.0199317 +[default0]: dataset 11, input: 0.0138497, achieved: 0.0138502 +[default0]: dataset 12, input: 0.00960604, achieved: 0.00960574 +[default0]: dataset 13, input: 0.00865243, achieved: 0.00865232 +[default0]: dataset 14, input: 0.00692261, achieved: 0.00692254 +[default0]: dataset 15, input: 0.00582803, achieved: 0.0058278 +[default0]: dataset 16, input: 0.00582586, achieved: 0.0058261 +[default0]: dataset 17, input: 0.00543684, achieved: 0.00543622 +[default0]: dataset 18, input: 0.00409056, achieved: 0.00409121 +[default0]: dataset 19, input: 0.00366562, achieved: 0.00366557 +[default0]: dataset 20, input: 0.00337934, achieved: 0.00337955 +[default0]: dataset 21, input: 0.00282756, achieved: 0.00282792 +[default0]: dataset 22, input: 0.00274012, achieved: 0.00273939 +[default0]: dataset 23, input: 0.00264619, achieved: 0.00264575 +[default0]: dataset 24, input: 0.00262358, achieved: 0.00262362 +[default0]: dataset 25, input: 0.0026003, achieved: 0.00259978 +[default0]: dataset 26, input: 0.00259099, achieved: 0.00259127 +[default0]: dataset 27, input: 0.00245151, achieved: 0.00245166 +[default0]: dataset 28, input: 0.00244735, achieved: 0.00244826 +[default0]: dataset 29, input: 0.00238684, achieved: 0.00238696 +[default0]: dataset 30, input: 0.0020053, achieved: 0.00200559 +[default0]: dataset 31, input: 0.00181876, achieved: 0.00181831 +[default0]: dataset 32, input: 0.00171918, achieved: 0.00171957 +[default0]: dataset 33, input: 0.00167779, achieved: 0.00167871 +[default0]: dataset 34, input: 0.00162359, achieved: 0.00162423 +[default0]: dataset 35, input: 0.00131237, achieved: 0.00131266 +[default0]: dataset 36, input: 0.00127347, achieved: 0.0012735 +[default0]: dataset 37, input: 0.00120564, achieved: 0.0012054 +[default0]: dataset 38, input: 0.00119533, achieved: 0.00119518 +[default0]: dataset 39, input: 0.00118536, achieved: 0.00118497 +[default0]: dataset 40, input: 0.00117488, achieved: 0.00117475 +[default0]: dataset 41, input: 0.00114928, achieved: 0.00114922 +[default0]: dataset 42, input: 0.00112334, achieved: 0.00112368 +[default0]: dataset 43, input: 0.00112318, achieved: 0.00112368 +[default0]: dataset 44, input: 0.00111237, achieved: 0.00111176 +[default0]: dataset 45, input: 0.00110439, achieved: 0.00110495 +[default0]:> elapsed time for building blendable dataset indices: 0.03 (sec) +[default0]:> finished creating T0 datasets ... +[default0]:GOTCONSUMEDSAMPLES 0 0 +[default0]:GOTCONSUMEDSAMPLES 0 0 +[default6]:GOTCONSUMEDSAMPLES 0 0 +[default0]:GOTCONSUMEDSAMPLES 0 0 +[default6]:GOTCONSUMEDSAMPLES 0 0 +[default4]:GOTCONSUMEDSAMPLES 0 0 +[default4]:GOTCONSUMEDSAMPLES 0 0 +[default6]:GOTCONSUMEDSAMPLES 0 0 +[default0]:GOTCONSUMEDSAMPLES 0 0 +[default6]:GOTCONSUMEDSAMPLES 0 0 +[default2]:GOTCONSUMEDSAMPLES 0 0 +[default0]:GOTCONSUMEDSAMPLES 0 0 +[default2]:GOTCONSUMEDSAMPLES 0 0 +[default4]:GOTCONSUMEDSAMPLES 0 0 +[default2]:GOTCONSUMEDSAMPLES 0 0 +[default4]:GOTCONSUMEDSAMPLES 0 0 +[default6]:GOTCONSUMEDSAMPLES 0 0 +[default0]:GOTCONSUMEDSAMPLES 0 0 +[default4]:GOTCONSUMEDSAMPLES 0 0 +[default6]:GOTCONSUMEDSAMPLES 0 0 +[default0]:GOTCONSUMEDSAMPLES 0 0 +[default2]:GOTCONSUMEDSAMPLES 0 0 +[default4]:GOTCONSUMEDSAMPLES 0 0 +[default2]:GOTCONSUMEDSAMPLES 0 0 +[default2]:GOTCONSUMEDSAMPLES 0 0 +[default0]:GOTCONSUMEDSAMPLES 0 0 +[default6]:GOTCONSUMEDSAMPLES 0 0 +[default2]:GOTCONSUMEDSAMPLES 0 0 +[default1]:[001-001] 2.2365B / 1.2089B +[default7]:time (ms) | model-and-optimizer-setup: 15844.36 | train/valid/test-data-iterators-setup: 15423.37 +[default0]:[after dataloaders are built] datetime: 2022-10-06 14:27:20 +[default1]:[001-000] 2.2365B / 1.2089B +[default0]:done with setup ... +[default0]:training ... +[default0]:Number of parameters: [tensor rank - pipeline rank] w/ and w/o embeddings: +[default0]:[000-000] 2.2365B / 1.2089B +[default1]:Traceback (most recent call last): +[default1]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default1]: main() +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default1]: return f(*args, **kwargs) +[default1]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default1]: pretrain( +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 188, in pretrain +[default1]: iteration = train(forward_step_func, +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 930, in train +[default1]: train_step(forward_step_func, +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 510, in train_step +[default1]: loss = model[0].train_batch(data_iter=data_iterator) +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 345, in train_batch +[default1]: self._exec_schedule(sched) +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1375, in _exec_schedule +[default1]: self._exec_instr(**cmd.kwargs) +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 789, in _exec_load_micro_batch +[default1]: batch = self._next_batch() +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 625, in _next_batch +[default1]: batch = self.batch_fn(batch) +[default1]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 70, in get_batch_pipe +[default1]: if 'text' in data: +[default1]:TypeError: argument of type 'NoneType' is not iterable +[default3]:Traceback (most recent call last): +[default3]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default3]: main() +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default3]: return f(*args, **kwargs) +[default3]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default3]: pretrain( +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 188, in pretrain +[default3]: iteration = train(forward_step_func, +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 930, in train +[default3]: train_step(forward_step_func, +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 510, in train_step +[default3]: loss = model[0].train_batch(data_iter=data_iterator) +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 345, in train_batch +[default3]: self._exec_schedule(sched) +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1375, in _exec_schedule +[default3]: self._exec_instr(**cmd.kwargs) +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 789, in _exec_load_micro_batch +[default3]: batch = self._next_batch() +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 625, in _next_batch +[default3]: batch = self.batch_fn(batch) +[default3]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 70, in get_batch_pipe +[default3]: if 'text' in data: +[default3]:TypeError: argument of type 'NoneType' is not iterable +[default3]:Traceback (most recent call last): +[default3]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default3]: main() +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default3]: return f(*args, **kwargs) +[default3]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default3]: pretrain( +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 188, in pretrain +[default3]: iteration = train(forward_step_func, +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 930, in train +[default3]: train_step(forward_step_func, +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 510, in train_step +[default3]: loss = model[0].train_batch(data_iter=data_iterator) +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 345, in train_batch +[default3]: self._exec_schedule(sched) +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1375, in _exec_schedule +[default3]: self._exec_instr(**cmd.kwargs) +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 789, in _exec_load_micro_batch +[default1]:Traceback (most recent call last): +[default1]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default1]: main() +[default3]: batch = self._next_batch() +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 625, in _next_batch +[default3]: batch = self.batch_fn(batch) +[default3]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 70, in get_batch_pipe +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default1]: return f(*args, **kwargs) +[default1]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default3]: if 'text' in data: +[default3]:TypeError: argument of type 'NoneType' is not iterable +[default1]: pretrain( +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 188, in pretrain +[default1]: iteration = train(forward_step_func, +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 930, in train +[default1]: train_step(forward_step_func, +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 510, in train_step +[default1]: loss = model[0].train_batch(data_iter=data_iterator) +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 345, in train_batch +[default1]: self._exec_schedule(sched) +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1375, in _exec_schedule +[default1]: self._exec_instr(**cmd.kwargs) +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 789, in _exec_load_micro_batch +[default1]: batch = self._next_batch() +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 625, in _next_batch +[default1]: batch = self.batch_fn(batch) +[default1]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 70, in get_batch_pipe +[default1]: if 'text' in data: +[default1]:TypeError: argument of type 'NoneType' is not iterable +[default3]:Traceback (most recent call last): +[default3]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default3]: main() +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default3]: return f(*args, **kwargs) +[default3]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default3]: pretrain( +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 188, in pretrain +[default3]: iteration = train(forward_step_func, +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 930, in train +[default3]: train_step(forward_step_func, +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 510, in train_step +[default3]: loss = model[0].train_batch(data_iter=data_iterator) +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 345, in train_batch +[default3]: self._exec_schedule(sched) +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1375, in _exec_schedule +[default3]: self._exec_instr(**cmd.kwargs) +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 789, in _exec_load_micro_batch +[default3]: batch = self._next_batch() +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 625, in _next_batch +[default3]: batch = self.batch_fn(batch) +[default3]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 70, in get_batch_pipe +[default3]: if 'text' in data: +[default3]:TypeError: argument of type 'NoneType' is not iterable +[default0]:[000-001] 2.2365B / 1.2089B +[default1]:Traceback (most recent call last): +[default1]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default1]: main() +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default1]: return f(*args, **kwargs) +[default1]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default1]: pretrain( +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 188, in pretrain +[default1]: iteration = train(forward_step_func, +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 930, in train +[default1]: train_step(forward_step_func, +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 510, in train_step +[default1]: loss = model[0].train_batch(data_iter=data_iterator) +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 345, in train_batch +[default1]: self._exec_schedule(sched) +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1375, in _exec_schedule +[default1]: self._exec_instr(**cmd.kwargs) +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 789, in _exec_load_micro_batch +[default1]: batch = self._next_batch() +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 625, in _next_batch +[default1]: batch = self.batch_fn(batch) +[default1]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 70, in get_batch_pipe +[default1]: if 'text' in data: +[default1]:TypeError: argument of type 'NoneType' is not iterable +[default5]:Traceback (most recent call last): +[default5]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default5]: main() +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default5]: return f(*args, **kwargs) +[default5]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default5]: pretrain( +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 188, in pretrain +[default5]: iteration = train(forward_step_func, +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 930, in train +[default5]: train_step(forward_step_func, +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 510, in train_step +[default5]: loss = model[0].train_batch(data_iter=data_iterator) +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 345, in train_batch +[default5]: self._exec_schedule(sched) +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1375, in _exec_schedule +[default5]: self._exec_instr(**cmd.kwargs) +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 789, in _exec_load_micro_batch +[default5]: batch = self._next_batch() +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 625, in _next_batch +[default5]: batch = self.batch_fn(batch) +[default5]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 70, in get_batch_pipe +[default5]: if 'text' in data: +[default5]:TypeError: argument of type 'NoneType' is not iterable +[default5]:Traceback (most recent call last): +[default5]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default5]: main() +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default5]: return f(*args, **kwargs) +[default5]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default5]: pretrain( +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 188, in pretrain +[default5]: iteration = train(forward_step_func, +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 930, in train +[default5]: train_step(forward_step_func, +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 510, in train_step +[default5]: loss = model[0].train_batch(data_iter=data_iterator) +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 345, in train_batch +[default5]: self._exec_schedule(sched) +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1375, in _exec_schedule +[default5]: self._exec_instr(**cmd.kwargs) +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 789, in _exec_load_micro_batch +[default5]: batch = self._next_batch() +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 625, in _next_batch +[default5]: batch = self.batch_fn(batch) +[default5]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 70, in get_batch_pipe +[default5]: if 'text' in data: +[default5]:TypeError: argument of type 'NoneType' is not iterable +[default7]:Traceback (most recent call last): +[default7]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default7]: main() +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default7]: return f(*args, **kwargs) +[default7]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default7]: pretrain( +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 188, in pretrain +[default7]: iteration = train(forward_step_func, +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 930, in train +[default7]: train_step(forward_step_func, +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 510, in train_step +[default7]: loss = model[0].train_batch(data_iter=data_iterator) +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 345, in train_batch +[default7]: self._exec_schedule(sched) +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1375, in _exec_schedule +[default7]: self._exec_instr(**cmd.kwargs) +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 789, in _exec_load_micro_batch +[default7]: batch = self._next_batch() +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 625, in _next_batch +[default7]: batch = self.batch_fn(batch) +[default7]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 70, in get_batch_pipe +[default7]: if 'text' in data: +[default7]:TypeError: argument of type 'NoneType' is not iterable +[default5]:Traceback (most recent call last): +[default5]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default5]: main() +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default5]: return f(*args, **kwargs) +[default5]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default5]: pretrain( +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 188, in pretrain +[default5]: iteration = train(forward_step_func, +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 930, in train +[default5]: train_step(forward_step_func, +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 510, in train_step +[default5]: loss = model[0].train_batch(data_iter=data_iterator) +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 345, in train_batch +[default5]: self._exec_schedule(sched) +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1375, in _exec_schedule +[default5]: self._exec_instr(**cmd.kwargs) +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 789, in _exec_load_micro_batch +[default5]: batch = self._next_batch() +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 625, in _next_batch +[default5]: batch = self.batch_fn(batch) +[default5]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 70, in get_batch_pipe +[default5]: if 'text' in data: +[default5]:TypeError: argument of type 'NoneType' is not iterable +[default1]:Traceback (most recent call last): +[default1]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default1]: main() +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default1]: return f(*args, **kwargs) +[default1]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default1]: pretrain( +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 188, in pretrain +[default1]: iteration = train(forward_step_func, +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 930, in train +[default1]: train_step(forward_step_func, +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 510, in train_step +[default1]: loss = model[0].train_batch(data_iter=data_iterator) +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 345, in train_batch +[default1]: self._exec_schedule(sched) +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1375, in _exec_schedule +[default1]: self._exec_instr(**cmd.kwargs) +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 789, in _exec_load_micro_batch +[default1]: batch = self._next_batch() +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 625, in _next_batch +[default1]: batch = self.batch_fn(batch) +[default1]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 70, in get_batch_pipe +[default1]: if 'text' in data: +[default1]:TypeError: argument of type 'NoneType' is not iterable +[default7]:Traceback (most recent call last): +[default7]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default7]: main() +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default7]: return f(*args, **kwargs) +[default7]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default7]: pretrain( +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 188, in pretrain +[default7]: iteration = train(forward_step_func, +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 930, in train +[default7]: train_step(forward_step_func, +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 510, in train_step +[default7]: loss = model[0].train_batch(data_iter=data_iterator) +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 345, in train_batch +[default7]: self._exec_schedule(sched) +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1375, in _exec_schedule +[default7]: self._exec_instr(**cmd.kwargs) +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 789, in _exec_load_micro_batch +[default7]: batch = self._next_batch() +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 625, in _next_batch +[default7]: batch = self.batch_fn(batch) +[default7]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 70, in get_batch_pipe +[default7]: if 'text' in data: +[default7]:TypeError: argument of type 'NoneType' is not iterable +[default7]:Traceback (most recent call last): +[default7]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default7]: main() +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default7]: return f(*args, **kwargs) +[default7]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default7]: pretrain( +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 188, in pretrain +[default7]: iteration = train(forward_step_func, +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 930, in train +[default7]: train_step(forward_step_func, +[default5]:Traceback (most recent call last): +[default5]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default5]: main() +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default5]: return f(*args, **kwargs) +[default5]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default5]: pretrain( +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 188, in pretrain +[default5]: iteration = train(forward_step_func, +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 930, in train +[default5]: train_step(forward_step_func, +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 510, in train_step +[default7]: loss = model[0].train_batch(data_iter=data_iterator) +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 345, in train_batch +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 510, in train_step +[default5]: loss = model[0].train_batch(data_iter=data_iterator) +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 345, in train_batch +[default7]: self._exec_schedule(sched) +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1375, in _exec_schedule +[default7]: self._exec_instr(**cmd.kwargs) +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 789, in _exec_load_micro_batch +[default7]: batch = self._next_batch() +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 625, in _next_batch +[default7]: batch = self.batch_fn(batch) +[default7]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 70, in get_batch_pipe +[default7]: if 'text' in data: +[default7]:TypeError: argument of type 'NoneType' is not iterable +[default5]: self._exec_schedule(sched) +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1375, in _exec_schedule +[default5]: self._exec_instr(**cmd.kwargs) +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 789, in _exec_load_micro_batch +[default5]: batch = self._next_batch() +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 625, in _next_batch +[default5]: batch = self.batch_fn(batch) +[default5]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 70, in get_batch_pipe +[default5]: if 'text' in data: +[default5]:TypeError: argument of type 'NoneType' is not iterable +[default3]:Traceback (most recent call last): +[default3]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default3]: main() +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default3]: return f(*args, **kwargs) +[default3]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default3]: pretrain( +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 188, in pretrain +[default3]: iteration = train(forward_step_func, +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 930, in train +[default3]: train_step(forward_step_func, +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 510, in train_step +[default3]: loss = model[0].train_batch(data_iter=data_iterator) +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 345, in train_batch +[default3]: self._exec_schedule(sched) +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1375, in _exec_schedule +[default3]: self._exec_instr(**cmd.kwargs) +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 789, in _exec_load_micro_batch +[default3]: batch = self._next_batch() +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 625, in _next_batch +[default3]: batch = self.batch_fn(batch) +[default3]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 70, in get_batch_pipe +[default3]: if 'text' in data: +[default3]:TypeError: argument of type 'NoneType' is not iterable +[default7]:Traceback (most recent call last): +[default7]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default7]: main() +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default7]: return f(*args, **kwargs) +[default7]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default7]: pretrain( +[default0]:[before the start of training step] datetime: 2022-10-06 14:27:20 +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 188, in pretrain +[default7]: iteration = train(forward_step_func, +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 930, in train +[default7]: train_step(forward_step_func, +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 510, in train_step +[default7]: loss = model[0].train_batch(data_iter=data_iterator) +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 345, in train_batch +[default7]: self._exec_schedule(sched) +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1375, in _exec_schedule +[default7]: self._exec_instr(**cmd.kwargs) +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 789, in _exec_load_micro_batch +[default7]: batch = self._next_batch() +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 625, in _next_batch +[default7]: batch = self.batch_fn(batch) +[default7]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 70, in get_batch_pipe +[default7]: if 'text' in data: +[default7]:TypeError: argument of type 'NoneType' is not iterable +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 2886094 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 2886095 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 2886096 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 2886097 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 2886098 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 2886099 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 2886100 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 3751085 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 3751086 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 3751087 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 3751089 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 3751091 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 905476 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 905477 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 905478 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 905480 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 905481 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 905482 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 1098764 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 1098766 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 1098768 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 1098770 closing signal SIGTERM +ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 3 (pid: 3751088) of binary: /gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/bin/python +ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 3 (pid: 905479) of binary: /gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/bin/python +ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 1 (pid: 1098765) of binary: /gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/bin/python +ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 7 (pid: 2886101) of binary: /gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/bin/python +ERROR:torch.distributed.elastic.agent.server.api:Error waiting on exit barrier. Elapsed: 323.8155674934387 seconds +Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/agent/server/api.py", line 906, in _exit_barrier + store_util.barrier( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/utils/store.py", line 78, in barrier + synchronize(store, data, rank, world_size, key_prefix, barrier_timeout) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/utils/store.py", line 64, in synchronize + agent_data = get_all(store, rank, key_prefix, world_size) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/utils/store.py", line 34, in get_all + data = store.get(f"{prefix}{idx}") +RuntimeError: Socket Timeout +Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 194, in _run_module_as_main + return _run_code(code, main_globals, None, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 87, in _run_code + exec(code, run_globals) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 765, in + main() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 761, in main + run(args) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 752, in run + elastic_launch( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 131, in __call__ + return launch_agent(self._config, self._entrypoint, list(args)) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 245, in launch_agent + raise ChildFailedError( +torch.distributed.elastic.multiprocessing.errors.ChildFailedError: +============================================================ +/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py FAILED +------------------------------------------------------------ +Failures: +[1]: + time : 2022-10-06_14:27:20 + host : jean-zay-iam24-ib0 + rank : 31 (local_rank: 7) + exitcode : 1 (pid: 905483) + error_file: /tmp/torchelastic_hj8_bb3z/none_85vd428l/attempt_0/7/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 188, in pretrain + iteration = train(forward_step_func, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 930, in train + train_step(forward_step_func, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 510, in train_step + loss = model[0].train_batch(data_iter=data_iterator) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 345, in train_batch + self._exec_schedule(sched) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1375, in _exec_schedule + self._exec_instr(**cmd.kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 789, in _exec_load_micro_batch + batch = self._next_batch() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 625, in _next_batch + batch = self.batch_fn(batch) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 70, in get_batch_pipe + if 'text' in data: + TypeError: argument of type 'NoneType' is not iterable + +------------------------------------------------------------ +Root Cause (first observed failure): +[0]: + time : 2022-10-06_14:27:20 + host : jean-zay-iam24-ib0 + rank : 27 (local_rank: 3) + exitcode : 1 (pid: 905479) + error_file: /tmp/torchelastic_hj8_bb3z/none_85vd428l/attempt_0/3/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 188, in pretrain + iteration = train(forward_step_func, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 930, in train + train_step(forward_step_func, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 510, in train_step + loss = model[0].train_batch(data_iter=data_iterator) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 345, in train_batch + self._exec_schedule(sched) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1375, in _exec_schedule + self._exec_instr(**cmd.kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 789, in _exec_load_micro_batch + batch = self._next_batch() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 625, in _next_batch + batch = self.batch_fn(batch) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 70, in get_batch_pipe + if 'text' in data: + TypeError: argument of type 'NoneType' is not iterable + +============================================================ +srun: error: jean-zay-iam24: task 3: Exited with exit code 1 +srun: launch/slurm: _step_signal: Terminating StepId=2076732.0 +ERROR:torch.distributed.elastic.agent.server.api:Error waiting on exit barrier. Elapsed: 328.35581159591675 seconds +Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/agent/server/api.py", line 906, in _exit_barrier + store_util.barrier( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/utils/store.py", line 78, in barrier + synchronize(store, data, rank, world_size, key_prefix, barrier_timeout) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/utils/store.py", line 64, in synchronize + agent_data = get_all(store, rank, key_prefix, world_size) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/utils/store.py", line 34, in get_all + data = store.get(f"{prefix}{idx}") +RuntimeError: Socket Timeout +Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 194, in _run_module_as_main + return _run_code(code, main_globals, None, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 87, in _run_code + exec(code, run_globals) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 765, in + main() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 761, in main + run(args) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 752, in run + elastic_launch( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 131, in __call__ + return launch_agent(self._config, self._entrypoint, list(args)) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 245, in launch_agent + raise ChildFailedError( +torch.distributed.elastic.multiprocessing.errors.ChildFailedError: +============================================================ +/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py FAILED +------------------------------------------------------------ +Failures: + +------------------------------------------------------------ +Root Cause (first observed failure): +[0]: + time : 2022-10-06_14:27:20 + host : jean-zay-iam23-ib0 + rank : 23 (local_rank: 7) + exitcode : 1 (pid: 2886101) + error_file: /tmp/torchelastic_23hco9hf/none_84txc7ca/attempt_0/7/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 188, in pretrain + iteration = train(forward_step_func, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 930, in train + train_step(forward_step_func, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 510, in train_step + loss = model[0].train_batch(data_iter=data_iterator) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 345, in train_batch + self._exec_schedule(sched) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1375, in _exec_schedule + self._exec_instr(**cmd.kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 789, in _exec_load_micro_batch + batch = self._next_batch() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 625, in _next_batch + batch = self.batch_fn(batch) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 70, in get_batch_pipe + if 'text' in data: + TypeError: argument of type 'NoneType' is not iterable + +============================================================ +slurmstepd: error: *** STEP 2076732.0 ON jean-zay-iam12 CANCELLED AT 2022-10-06T14:32:50 *** +WARNING:torch.distributed.elastic.agent.server.api:Received 15 death signal, shutting down workers +WARNING:torch.distributed.elastic.agent.server.api:Received 15 death signal, shutting down workers +WARNING:torch.distributed.elastic.agent.server.api:Received 15 death signal, shutting down workers +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 1757117 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 98961 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 3443006 closing signal SIGTERM +WARNING:torch.distributed.elastic.agent.server.api:Received 15 death signal, shutting down workers +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 3443007 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 3919266 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 1757118 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 98962 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 3919267 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 3443008 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 1757119 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 98963 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 3919268 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 1757120 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 3443009 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 98964 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 3919269 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 1757121 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 3443010 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 3919270 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 98965 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 1757122 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 3443011 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 98966 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 1757123 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 3919271 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 3443012 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 98967 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 98968 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 3443013 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 3919272 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 3919273 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 1757124 closing signal SIGTERM +WARNING:torch.distributed.elastic.agent.server.api:Received 15 death signal, shutting down workers +Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/agent/server/api.py", line 906, in _exit_barrier + store_util.barrier( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/utils/store.py", line 78, in barrier + synchronize(store, data, rank, world_size, key_prefix, barrier_timeout) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/utils/store.py", line 64, in synchronize + agent_data = get_all(store, rank, key_prefix, world_size) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/utils/store.py", line 34, in get_all + data = store.get(f"{prefix}{idx}") +RuntimeError: Socket Timeout + +During handling of the above exception, another exception occurred: + +Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 194, in _run_module_as_main + return _run_code(code, main_globals, None, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 87, in _run_code + exec(code, run_globals) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 765, in + main() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 761, in main + run(args) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 752, in run + elastic_launch( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 131, in __call__ + return launch_agent(self._config, self._entrypoint, list(args)) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 236, in launch_agent + result = agent.run() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/metrics/api.py", line 125, in wrapper + result = f(*args, **kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/agent/server/api.py", line 709, in run + result = self._invoke_run(role) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/agent/server/api.py", line 877, in _invoke_run + self._exit_barrier() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/agent/server/api.py", line 906, in _exit_barrier + store_util.barrier( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/api.py", line 60, in _terminate_process_handler + raise SignalException(f"Process {os.getpid()} got signal: {sigval}", sigval=sigval) +torch.distributed.elastic.multiprocessing.api.SignalException: Process 3751046 got signal: 15 +WARNING:torch.distributed.elastic.agent.server.api:Received 15 death signal, shutting down workers +Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/agent/server/api.py", line 906, in _exit_barrier + store_util.barrier( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/utils/store.py", line 78, in barrier + synchronize(store, data, rank, world_size, key_prefix, barrier_timeout) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/utils/store.py", line 64, in synchronize + agent_data = get_all(store, rank, key_prefix, world_size) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/utils/store.py", line 34, in get_all + data = store.get(f"{prefix}{idx}") +RuntimeError: Socket Timeout + +During handling of the above exception, another exception occurred: + +Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 194, in _run_module_as_main + return _run_code(code, main_globals, None, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 87, in _run_code + exec(code, run_globals) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 765, in + main() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 761, in main + run(args) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 752, in run + elastic_launch( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 131, in __call__ + return launch_agent(self._config, self._entrypoint, list(args)) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 236, in launch_agent + result = agent.run() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/metrics/api.py", line 125, in wrapper + result = f(*args, **kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/agent/server/api.py", line 709, in run + result = self._invoke_run(role) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/agent/server/api.py", line 877, in _invoke_run + self._exit_barrier() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/agent/server/api.py", line 906, in _exit_barrier + store_util.barrier( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/api.py", line 60, in _terminate_process_handler + raise SignalException(f"Process {os.getpid()} got signal: {sigval}", sigval=sigval) +torch.distributed.elastic.multiprocessing.api.SignalException: Process 1098724 got signal: 15 +WARNING:torch.distributed.elastic.rendezvous.dynamic_rendezvous:The node 'jean-zay-iam27-ib0_3442968_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. +srun: error: jean-zay-iam23: task 2: Exited with exit code 1 +WARNING:torch.distributed.elastic.rendezvous.dynamic_rendezvous:The node 'jean-zay-iam28-ib0_3919228_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. +srun: error: jean-zay-iam12: task 0: Exited with exit code 1 +WARNING:torch.distributed.elastic.rendezvous.dynamic_rendezvous:The node 'jean-zay-iam29-ib0_1757079_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. +WARNING:torch.distributed.elastic.rendezvous.dynamic_rendezvous:The node 'jean-zay-iam25-ib0_98922_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. +srun: error: jean-zay-iam22: task 1: Exited with exit code 1 +Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 194, in _run_module_as_main + return _run_code(code, main_globals, None, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 87, in _run_code + exec(code, run_globals) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 765, in + main() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 761, in main + run(args) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 752, in run + elastic_launch( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 131, in __call__ + return launch_agent(self._config, self._entrypoint, list(args)) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 236, in launch_agent + result = agent.run() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/metrics/api.py", line 125, in wrapper + result = f(*args, **kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/agent/server/api.py", line 709, in run + result = self._invoke_run(role) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/agent/server/api.py", line 850, in _invoke_run + time.sleep(monitor_interval) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/api.py", line 60, in _terminate_process_handler + raise SignalException(f"Process {os.getpid()} got signal: {sigval}", sigval=sigval) +torch.distributed.elastic.multiprocessing.api.SignalException: Process 1757079 got signal: 15 +Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 194, in _run_module_as_main + return _run_code(code, main_globals, None, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 87, in _run_code + exec(code, run_globals) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 765, in + main() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 761, in main + run(args) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 752, in run + elastic_launch( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 131, in __call__ + return launch_agent(self._config, self._entrypoint, list(args)) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 236, in launch_agent + result = agent.run() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/metrics/api.py", line 125, in wrapper + result = f(*args, **kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/agent/server/api.py", line 709, in run + result = self._invoke_run(role) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/agent/server/api.py", line 850, in _invoke_run + time.sleep(monitor_interval) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/api.py", line 60, in _terminate_process_handler + raise SignalException(f"Process {os.getpid()} got signal: {sigval}", sigval=sigval) +torch.distributed.elastic.multiprocessing.api.SignalException: Process 98922 got signal: 15 +Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 194, in _run_module_as_main + return _run_code(code, main_globals, None, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 87, in _run_code + exec(code, run_globals) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 765, in + main() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 761, in main + run(args) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 752, in run + elastic_launch( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 131, in __call__ + return launch_agent(self._config, self._entrypoint, list(args)) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 236, in launch_agent + result = agent.run() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/metrics/api.py", line 125, in wrapper + result = f(*args, **kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/agent/server/api.py", line 709, in run + result = self._invoke_run(role) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/agent/server/api.py", line 850, in _invoke_run + time.sleep(monitor_interval) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/api.py", line 60, in _terminate_process_handler + raise SignalException(f"Process {os.getpid()} got signal: {sigval}", sigval=sigval) +torch.distributed.elastic.multiprocessing.api.SignalException: Process 3919228 got signal: 15 +Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 194, in _run_module_as_main + return _run_code(code, main_globals, None, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 87, in _run_code + exec(code, run_globals) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 765, in + main() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 761, in main + run(args) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 752, in run + elastic_launch( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 131, in __call__ + return launch_agent(self._config, self._entrypoint, list(args)) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 236, in launch_agent + result = agent.run() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/metrics/api.py", line 125, in wrapper + result = f(*args, **kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/agent/server/api.py", line 709, in run + result = self._invoke_run(role) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/agent/server/api.py", line 850, in _invoke_run + time.sleep(monitor_interval) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/api.py", line 60, in _terminate_process_handler + raise SignalException(f"Process {os.getpid()} got signal: {sigval}", sigval=sigval) +torch.distributed.elastic.multiprocessing.api.SignalException: Process 3442968 got signal: 15 +srun: error: jean-zay-iam29: task 7: Exited with exit code 1 +srun: error: jean-zay-iam25: task 4: Exited with exit code 1 +srun: error: jean-zay-iam28: task 6: Exited with exit code 1 +srun: error: jean-zay-iam27: task 5: Exited with exit code 1 +/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/cuda/__init__.py:83: UserWarning: CUDA initialization: Unexpected error from cudaGetDeviceCount(). Did you run some cuda functions before calling NumCudaDevices() that might have already set an error? Error 802: system not yet initialized (Triggered internally at /opt/conda/conda-bld/pytorch_1656352465323/work/c10/cuda/CUDAFunctions.cpp:109.) + return torch._C._cuda_getDeviceCount() > 0 +/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/cuda/__init__.py:83: UserWarning: CUDA initialization: Unexpected error from cudaGetDeviceCount(). Did you run some cuda functions before calling NumCudaDevices() that might have already set an error? Error 802: system not yet initialized (Triggered internally at /opt/conda/conda-bld/pytorch_1656352465323/work/c10/cuda/CUDAFunctions.cpp:109.) + return torch._C._cuda_getDeviceCount() > 0 +WARNING:__main__: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +WARNING:__main__: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +WARNING:__main__: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +WARNING:__main__: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +WARNING:__main__: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +WARNING:__main__: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +WARNING:__main__: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +WARNING:__main__: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +[default0]:/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/cuda/__init__.py:83: UserWarning: CUDA initialization: Unexpected error from cudaGetDeviceCount(). Did you run some cuda functions before calling NumCudaDevices() that might have already set an error? Error 802: system not yet initialized (Triggered internally at /opt/conda/conda-bld/pytorch_1656352465323/work/c10/cuda/CUDAFunctions.cpp:109.) +[default0]: return torch._C._cuda_getDeviceCount() > 0 +[default2]:/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/cuda/__init__.py:83: UserWarning: CUDA initialization: Unexpected error from cudaGetDeviceCount(). Did you run some cuda functions before calling NumCudaDevices() that might have already set an error? Error 802: system not yet initialized (Triggered internally at /opt/conda/conda-bld/pytorch_1656352465323/work/c10/cuda/CUDAFunctions.cpp:109.) +[default2]: return torch._C._cuda_getDeviceCount() > 0 +[default1]:/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/cuda/__init__.py:83: UserWarning: CUDA initialization: Unexpected error from cudaGetDeviceCount(). Did you run some cuda functions before calling NumCudaDevices() that might have already set an error? Error 802: system not yet initialized (Triggered internally at /opt/conda/conda-bld/pytorch_1656352465323/work/c10/cuda/CUDAFunctions.cpp:109.) +[default1]: return torch._C._cuda_getDeviceCount() > 0 +[default3]:/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/cuda/__init__.py:83: UserWarning: CUDA initialization: Unexpected error from cudaGetDeviceCount(). Did you run some cuda functions before calling NumCudaDevices() that might have already set an error? Error 802: system not yet initialized (Triggered internally at /opt/conda/conda-bld/pytorch_1656352465323/work/c10/cuda/CUDAFunctions.cpp:109.) +[default3]: return torch._C._cuda_getDeviceCount() > 0 +[default4]:/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/cuda/__init__.py:83: UserWarning: CUDA initialization: Unexpected error from cudaGetDeviceCount(). Did you run some cuda functions before calling NumCudaDevices() that might have already set an error? Error 802: system not yet initialized (Triggered internally at /opt/conda/conda-bld/pytorch_1656352465323/work/c10/cuda/CUDAFunctions.cpp:109.) +[default4]: return torch._C._cuda_getDeviceCount() > 0 +[default6]:/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/cuda/__init__.py:83: UserWarning: CUDA initialization: Unexpected error from cudaGetDeviceCount(). Did you run some cuda functions before calling NumCudaDevices() that might have already set an error? Error 802: system not yet initialized (Triggered internally at /opt/conda/conda-bld/pytorch_1656352465323/work/c10/cuda/CUDAFunctions.cpp:109.) +[default6]: return torch._C._cuda_getDeviceCount() > 0 +[default5]:/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/cuda/__init__.py:83: UserWarning: CUDA initialization: Unexpected error from cudaGetDeviceCount(). Did you run some cuda functions before calling NumCudaDevices() that might have already set an error? Error 802: system not yet initialized (Triggered internally at /opt/conda/conda-bld/pytorch_1656352465323/work/c10/cuda/CUDAFunctions.cpp:109.) +[default5]: return torch._C._cuda_getDeviceCount() > 0 +[default7]:/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/cuda/__init__.py:83: UserWarning: CUDA initialization: Unexpected error from cudaGetDeviceCount(). Did you run some cuda functions before calling NumCudaDevices() that might have already set an error? Error 802: system not yet initialized (Triggered internally at /opt/conda/conda-bld/pytorch_1656352465323/work/c10/cuda/CUDAFunctions.cpp:109.) +[default7]: return torch._C._cuda_getDeviceCount() > 0 +[default0]:/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/cuda/__init__.py:83: UserWarning: CUDA initialization: Unexpected error from cudaGetDeviceCount(). Did you run some cuda functions before calling NumCudaDevices() that might have already set an error? Error 802: system not yet initialized (Triggered internally at /opt/conda/conda-bld/pytorch_1656352465323/work/c10/cuda/CUDAFunctions.cpp:109.) +[default0]: return torch._C._cuda_getDeviceCount() > 0 +[default2]:/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/cuda/__init__.py:83: UserWarning: CUDA initialization: Unexpected error from cudaGetDeviceCount(). Did you run some cuda functions before calling NumCudaDevices() that might have already set an error? Error 802: system not yet initialized (Triggered internally at /opt/conda/conda-bld/pytorch_1656352465323/work/c10/cuda/CUDAFunctions.cpp:109.) +[default2]: return torch._C._cuda_getDeviceCount() > 0 +[default3]:/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/cuda/__init__.py:83: UserWarning: CUDA initialization: Unexpected error from cudaGetDeviceCount(). Did you run some cuda functions before calling NumCudaDevices() that might have already set an error? Error 802: system not yet initialized (Triggered internally at /opt/conda/conda-bld/pytorch_1656352465323/work/c10/cuda/CUDAFunctions.cpp:109.) +[default3]: return torch._C._cuda_getDeviceCount() > 0 +[default4]:/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/cuda/__init__.py:83: UserWarning: CUDA initialization: Unexpected error from cudaGetDeviceCount(). Did you run some cuda functions before calling NumCudaDevices() that might have already set an error? Error 802: system not yet initialized (Triggered internally at /opt/conda/conda-bld/pytorch_1656352465323/work/c10/cuda/CUDAFunctions.cpp:109.) +[default4]: return torch._C._cuda_getDeviceCount() > 0 +[default5]:/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/cuda/__init__.py:83: UserWarning: CUDA initialization: Unexpected error from cudaGetDeviceCount(). Did you run some cuda functions before calling NumCudaDevices() that might have already set an error? Error 802: system not yet initialized (Triggered internally at /opt/conda/conda-bld/pytorch_1656352465323/work/c10/cuda/CUDAFunctions.cpp:109.) +[default5]: return torch._C._cuda_getDeviceCount() > 0 +[default7]:/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/cuda/__init__.py:83: UserWarning: CUDA initialization: Unexpected error from cudaGetDeviceCount(). Did you run some cuda functions before calling NumCudaDevices() that might have already set an error? Error 802: system not yet initialized (Triggered internally at /opt/conda/conda-bld/pytorch_1656352465323/work/c10/cuda/CUDAFunctions.cpp:109.) +[default7]: return torch._C._cuda_getDeviceCount() > 0 +[default1]:/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/cuda/__init__.py:83: UserWarning: CUDA initialization: Unexpected error from cudaGetDeviceCount(). Did you run some cuda functions before calling NumCudaDevices() that might have already set an error? Error 802: system not yet initialized (Triggered internally at /opt/conda/conda-bld/pytorch_1656352465323/work/c10/cuda/CUDAFunctions.cpp:109.) +[default1]: return torch._C._cuda_getDeviceCount() > 0 +[default6]:/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/cuda/__init__.py:83: UserWarning: CUDA initialization: Unexpected error from cudaGetDeviceCount(). Did you run some cuda functions before calling NumCudaDevices() that might have already set an error? Error 802: system not yet initialized (Triggered internally at /opt/conda/conda-bld/pytorch_1656352465323/work/c10/cuda/CUDAFunctions.cpp:109.) +[default6]: return torch._C._cuda_getDeviceCount() > 0 +[default0]:No CUDA runtime is found, using CUDA_HOME='/gpfslocalsys/cuda/11.4.3' +[default2]:No CUDA runtime is found, using CUDA_HOME='/gpfslocalsys/cuda/11.4.3' +[default3]:No CUDA runtime is found, using CUDA_HOME='/gpfslocalsys/cuda/11.4.3' +[default4]:No CUDA runtime is found, using CUDA_HOME='/gpfslocalsys/cuda/11.4.3' +[default6]:No CUDA runtime is found, using CUDA_HOME='/gpfslocalsys/cuda/11.4.3' +[default7]:No CUDA runtime is found, using CUDA_HOME='/gpfslocalsys/cuda/11.4.3' +[default5]:No CUDA runtime is found, using CUDA_HOME='/gpfslocalsys/cuda/11.4.3' +[default1]:No CUDA runtime is found, using CUDA_HOME='/gpfslocalsys/cuda/11.4.3' +[default4]:No CUDA runtime is found, using CUDA_HOME='/gpfslocalsys/cuda/11.4.3' +[default3]:No CUDA runtime is found, using CUDA_HOME='/gpfslocalsys/cuda/11.4.3' +[default0]:No CUDA runtime is found, using CUDA_HOME='/gpfslocalsys/cuda/11.4.3' +[default2]:No CUDA runtime is found, using CUDA_HOME='/gpfslocalsys/cuda/11.4.3' +[default5]:No CUDA runtime is found, using CUDA_HOME='/gpfslocalsys/cuda/11.4.3' +[default7]:No CUDA runtime is found, using CUDA_HOME='/gpfslocalsys/cuda/11.4.3' +[default1]:No CUDA runtime is found, using CUDA_HOME='/gpfslocalsys/cuda/11.4.3' +[default6]:No CUDA runtime is found, using CUDA_HOME='/gpfslocalsys/cuda/11.4.3' +[default0]:Traceback (most recent call last): +[default0]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default0]: main() +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default4]:Traceback (most recent call last): +[default4]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default4]: main() +[default1]:Traceback (most recent call last): +[default1]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default1]: main() +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default1]: return f(*args, **kwargs) +[default1]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default1]: pretrain( +[default3]:Traceback (most recent call last): +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default1]: initialize_megatron(extra_args_provider=extra_args_provider, +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 83, in initialize_megatron +[default0]: return f(*args, **kwargs) +[default0]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default0]: pretrain( +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default0]: initialize_megatron(extra_args_provider=extra_args_provider, +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 83, in initialize_megatron +[default3]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default0]: assert torch.cuda.is_available(), 'Megatron requires CUDA.' +[default0]:AssertionError: Megatron requires CUDA. +[default1]: assert torch.cuda.is_available(), 'Megatron requires CUDA.' +[default3]: main() +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default3]: return f(*args, **kwargs) +[default3]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default3]: pretrain( +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default1]:AssertionError: Megatron requires CUDA. +[default3]: initialize_megatron(extra_args_provider=extra_args_provider, +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 83, in initialize_megatron +[default3]: assert torch.cuda.is_available(), 'Megatron requires CUDA.' +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default4]: return f(*args, **kwargs) +[default4]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default4]: pretrain( +[default3]:AssertionError: Megatron requires CUDA. +[default7]:Traceback (most recent call last): +[default7]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default7]: main() +[default2]:Traceback (most recent call last): +[default2]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default4]: initialize_megatron(extra_args_provider=extra_args_provider, +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 83, in initialize_megatron +[default2]: main() +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default2]: return f(*args, **kwargs) +[default4]: assert torch.cuda.is_available(), 'Megatron requires CUDA.' +[default2]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default4]:AssertionError: Megatron requires CUDA. +[default5]:Traceback (most recent call last): +[default5]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default5]: main() +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default5]: return f(*args, **kwargs) +[default7]: return f(*args, **kwargs) +[default5]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default5]: pretrain( +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default5]: initialize_megatron(extra_args_provider=extra_args_provider, +[default7]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default7]: pretrain( +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default7]: initialize_megatron(extra_args_provider=extra_args_provider, +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 83, in initialize_megatron +[default7]: assert torch.cuda.is_available(), 'Megatron requires CUDA.' +[default7]:AssertionError: Megatron requires CUDA. +[default6]:Traceback (most recent call last): +[default2]: pretrain( +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default2]: initialize_megatron(extra_args_provider=extra_args_provider, +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 83, in initialize_megatron +[default2]: assert torch.cuda.is_available(), 'Megatron requires CUDA.' +[default6]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default6]: main() +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 83, in initialize_megatron +[default5]: assert torch.cuda.is_available(), 'Megatron requires CUDA.' +[default2]:AssertionError: Megatron requires CUDA. +[default6]: return f(*args, **kwargs) +[default5]:AssertionError: Megatron requires CUDA. +[default6]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default6]: pretrain( +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default6]: initialize_megatron(extra_args_provider=extra_args_provider, +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 83, in initialize_megatron +[default6]: assert torch.cuda.is_available(), 'Megatron requires CUDA.' +[default6]:AssertionError: Megatron requires CUDA. +[default3]:Traceback (most recent call last): +[default1]:Traceback (most recent call last): +[default3]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default3]: main() +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default3]: return f(*args, **kwargs) +[default3]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default3]: pretrain( +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default3]: initialize_megatron(extra_args_provider=extra_args_provider, +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 83, in initialize_megatron +[default3]: assert torch.cuda.is_available(), 'Megatron requires CUDA.' +[default3]:AssertionError: Megatron requires CUDA. +[default4]:Traceback (most recent call last): +[default4]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default4]: main() +[default1]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default1]: main() +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default1]: return f(*args, **kwargs) +[default1]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default5]:Traceback (most recent call last): +[default4]: return f(*args, **kwargs) +[default4]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default1]: pretrain( +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default1]: initialize_megatron(extra_args_provider=extra_args_provider, +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 83, in initialize_megatron +[default1]: assert torch.cuda.is_available(), 'Megatron requires CUDA.' +[default1]:AssertionError: Megatron requires CUDA. +[default4]: pretrain( +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default4]: initialize_megatron(extra_args_provider=extra_args_provider, +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 83, in initialize_megatron +[default4]: assert torch.cuda.is_available(), 'Megatron requires CUDA.' +[default4]:AssertionError: Megatron requires CUDA. +[default5]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default2]:Traceback (most recent call last): +[default2]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default0]:Traceback (most recent call last): +[default0]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default0]: main() +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default0]: return f(*args, **kwargs) +[default2]: main() +[default0]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default0]: pretrain( +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default5]: main() +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default2]: return f(*args, **kwargs) +[default2]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default0]: initialize_megatron(extra_args_provider=extra_args_provider, +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 83, in initialize_megatron +[default0]: assert torch.cuda.is_available(), 'Megatron requires CUDA.' +[default2]: pretrain( +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default2]: initialize_megatron(extra_args_provider=extra_args_provider, +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 83, in initialize_megatron +[default5]: return f(*args, **kwargs) +[default5]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default5]: pretrain( +[default2]: assert torch.cuda.is_available(), 'Megatron requires CUDA.' +[default0]:AssertionError: Megatron requires CUDA. +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default5]: initialize_megatron(extra_args_provider=extra_args_provider, +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 83, in initialize_megatron +[default2]:AssertionError: Megatron requires CUDA. +[default5]: assert torch.cuda.is_available(), 'Megatron requires CUDA.' +[default5]:AssertionError: Megatron requires CUDA. +[default6]:Traceback (most recent call last): +[default6]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default6]: main() +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default6]: return f(*args, **kwargs) +[default6]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default6]: pretrain( +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default6]: initialize_megatron(extra_args_provider=extra_args_provider, +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 83, in initialize_megatron +[default6]: assert torch.cuda.is_available(), 'Megatron requires CUDA.' +[default6]:AssertionError: Megatron requires CUDA. +[default7]:Traceback (most recent call last): +[default7]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default7]: main() +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default7]: return f(*args, **kwargs) +[default7]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default7]: pretrain( +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default7]: initialize_megatron(extra_args_provider=extra_args_provider, +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 83, in initialize_megatron +[default7]: assert torch.cuda.is_available(), 'Megatron requires CUDA.' +[default7]:AssertionError: Megatron requires CUDA. +[default0]:using world size: 64, data-parallel-size: 16, tensor-model-parallel size: 2, pipeline-model-parallel size: 2 +[default0]:using torch.float16 for parameters ... +[default0]:------------------------ arguments ------------------------ +[default0]: abort_on_unmet_fused_kernel_constraints ......... True +[default0]: accumulate_allreduce_grads_in_fp32 .............. False +[default0]: adam_beta1 ...................................... 0.9 +[default0]: adam_beta2 ...................................... 0.95 +[default0]: adam_eps ........................................ 1e-08 +[default0]: adlr_autoresume ................................. False +[default0]: adlr_autoresume_interval ........................ 1000 +[default0]: apply_query_key_layer_scaling ................... True +[default0]: apply_residual_connection_post_layernorm ........ False +[default0]: attention_dropout ............................... 0.1 +[default0]: attention_softmax_in_fp32 ....................... False +[default0]: bert_binary_head ................................ True +[default0]: bert_load ....................................... None +[default0]: bf16 ............................................ False +[default0]: bias_dropout_fusion ............................. True +[default0]: bias_gelu_fusion ................................ True +[default0]: biencoder_projection_dim ........................ 0 +[default0]: biencoder_shared_query_context_model ............ False +[default0]: block_data_path ................................. None +[default0]: checkpoint_activations .......................... True +[default0]: checkpoint_in_cpu ............................... False +[default0]: checkpoint_num_layers ........................... 1 +[default0]: clip_grad ....................................... 1.0 +[default0]: codecarbon_dir .................................. None +[default0]: consumed_train_samples .......................... 0 +[default0]: consumed_train_tokens ........................... 0 +[default0]: consumed_valid_samples .......................... 0 +[default0]: contigious_checkpointing ........................ False +[default0]: cpu_optimizer ................................... False +[default0]: cpu_torch_adam .................................. False +[default0]: curriculum_learning ............................. False +[default0]: data_impl ....................................... mmap +[default0]: data_parallel_size .............................. 16 +[default0]: data_path ....................................... None +[default0]: dataloader_type ................................. single +[default0]: DDP_impl ........................................ local +[default0]: decoder_seq_length .............................. None +[default0]: deepscale ....................................... False +[default0]: deepscale_config ................................ None +[default0]: deepspeed ....................................... True +[default0]: deepspeed_activation_checkpointing .............. True +[default0]: deepspeed_config ................................ ./ds_config.2094446.json +[default0]: deepspeed_mpi ................................... False +[default0]: distribute_checkpointed_activations ............. False +[default0]: distributed_backend ............................. nccl +[default0]: embed_layernorm ................................. True +[default0]: embedding_path .................................. None +[default0]: encoder_seq_length .............................. 2048 +[default0]: eod_mask_loss ................................... False +[default0]: eval_interval ................................... 250 +[default0]: eval_iters ...................................... 5 +[default0]: eval_only ....................................... None +[default0]: evidence_data_path .............................. None +[default0]: exit_duration_in_mins ........................... 5990 +[default0]: exit_interval ................................... None +[default0]: ffn_hidden_size ................................. 8192 +[default0]: finetune ........................................ False +[default0]: fp16 ............................................ True +[default0]: fp16_lm_cross_entropy ........................... False +[default0]: fp32_residual_connection ........................ False +[default0]: gigaflos_no_embeds .............................. 0 +[default0]: global_batch_size ............................... 2048 +[default0]: glu_activation .................................. None +[default0]: hidden_dropout .................................. 0.1 +[default0]: hidden_size ..................................... 2048 +[default0]: hysteresis ...................................... 2 +[default0]: ict_head_size ................................... None +[default0]: ict_load ........................................ None +[default0]: img_dim ......................................... 224 +[default0]: indexer_batch_size .............................. 128 +[default0]: indexer_log_interval ............................ 1000 +[default0]: inference ....................................... False +[default0]: init_method_std ................................. 0.0048 +[default0]: init_method_xavier_uniform ...................... False +[default0]: initial_loss_scale .............................. 4294967296 +[default0]: kill_switch_path ................................ /gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/kill-switch-tr13b-1B3-mtf +[default0]: kv_channels ..................................... 128 +[default0]: layernorm_epsilon ............................... 1e-05 +[default0]: lazy_mpu_init ................................... None +[default0]: load ............................................ /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq +[default0]: local_rank ...................................... None +[default0]: log_batch_size_to_tensorboard ................... True +[default0]: log_interval .................................... 1 +[default0]: log_learning_rate_to_tensorboard ................ True +[default0]: log_level ....................................... None +[default0]: log_level_replica ............................... None +[default0]: log_loss_scale_to_tensorboard ................... True +[default0]: log_num_zeros_in_grad ........................... False +[default0]: log_params_norm ................................. False +[default0]: log_path ........................................ None +[default0]: log_timers_to_tensorboard ....................... True +[default0]: log_validation_ppl_to_tensorboard ............... True +[default0]: loss_on_targets_only ............................ False +[default0]: loss_scale ...................................... None +[default0]: loss_scale_window ............................... 1000 +[default0]: lr .............................................. 2e-05 +[default0]: lr_decay_iters .................................. None +[default0]: lr_decay_samples ................................ None +[default0]: lr_decay_style .................................. constant +[default0]: lr_decay_tokens ................................. None +[default0]: lr_warmup_fraction .............................. None +[default0]: lr_warmup_iters ................................. 0 +[default0]: lr_warmup_samples ............................... 0 +[default0]: make_vocab_size_divisible_by .................... 128 +[default0]: mask_prob ....................................... 0.15 +[default0]: masked_softmax_fusion ........................... True +[default0]: max_position_embeddings ......................... 2048 +[default0]: mean_noise_span_length .......................... None +[default0]: memory_centric_tiled_linear ..................... False +[default0]: merge_file ...................................... None +[default0]: micro_batch_size ................................ 1 +[default0]: min_loss_scale .................................. 1.0 +[default0]: min_lr .......................................... 0.0 +[default0]: mmap_warmup ..................................... False +[default0]: no_load_optim ................................... True +[default0]: no_load_rng ..................................... None +[default0]: no_save_optim ................................... None +[default0]: no_save_rng ..................................... None +[default0]: noise_density ................................... None +[default0]: norm_target_loss ................................ True +[default0]: num_attention_heads ............................. 16 +[default0]: num_channels .................................... 3 +[default0]: num_classes ..................................... 1000 +[default0]: num_layers ...................................... 24 +[default0]: num_layers_per_virtual_pipeline_stage ........... None +[default0]: num_workers ..................................... 2 +[default0]: onnx_safe ....................................... None +[default0]: openai_gelu ..................................... False +[default0]: optimizer ....................................... adam +[default0]: override_lr_scheduler ........................... False +[default0]: pad_vocab_size_to ............................... 250880 +[default0]: params_dtype .................................... torch.float16 +[default0]: partition_activations ........................... False +[default0]: patch_dim ....................................... 16 +[default0]: pipeline_model_parallel_size .................... 2 +[default0]: position_embedding_type ......................... PositionEmbeddingType.alibi +[default0]: pp_partition_method ............................. type:transformer|embedding +[default0]: prefixlm ........................................ False +[default0]: profile_backward ................................ False +[default0]: query_in_block_prob ............................. 0.1 +[default0]: rampup_batch_size ............................... None +[default0]: rank ............................................ 0 +[default0]: remote_device ................................... none +[default0]: reset_attention_mask ............................ False +[default0]: reset_position_ids .............................. False +[default0]: reset_progress .................................. True +[default0]: retriever_report_topk_accuracies ................ [] +[default0]: retriever_score_scaling ......................... False +[default0]: retriever_seq_length ............................ 256 +[default0]: reweight_loss_based_on_position_frequency ....... False +[default0]: sample_rate ..................................... 1.0 +[default0]: save ............................................ /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq +[default0]: save_interval ................................... 2 +[default0]: scatter_gather_tensors_in_pipeline .............. True +[default0]: scattered_embeddings ............................ False +[default0]: seed ............................................ 42 +[default0]: seq_length ...................................... 2048 +[default0]: sgd_momentum .................................... 0.9 +[default0]: short_seq_prob .................................. 0.1 +[default0]: skip_train_iteration_range ...................... None +[default0]: split ........................................... None +[default0]: split_transformers .............................. False +[default0]: sync_tp_duplicated_parameters ................... False +[default0]: synchronize_each_layer .......................... False +[default0]: tensor_model_parallel_size ...................... 2 +[default0]: tensorboard_dir ................................. /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/tr13b-1B3-ml-logs/tensorboard/xp3capmixnewcodelonglossseq +[default0]: tensorboard_log_interval ........................ 1 +[default0]: tensorboard_queue_size .......................... 5 +[default0]: test_weighted_split_paths ....................... None +[default0]: test_weighted_split_paths_path .................. None +[default0]: tile_factor ..................................... 1 +[default0]: titles_data_path ................................ None +[default0]: tokenizer_name_or_path .......................... bigscience/tokenizer +[default0]: tokenizer_type .................................. PretrainedFromHF +[default0]: train_iters ..................................... None +[default0]: train_samples ................................... 6348800 +[default0]: train_tokens .................................... None +[default0]: train_weighted_split_names ...................... ['train'] +[default0]: train_weighted_split_paths ...................... [['/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_en', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_es', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_pt', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_code', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_fr', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ar', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_id', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_zh', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_hi', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_vi', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ur', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_te', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ta', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_bn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_mr', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_sw', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_gu', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_pa', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ne', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_yo', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ig', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ny', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_zu', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_xh', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_sn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ts', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_rw', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_lg', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_tn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_nso', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_rn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ml', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_kn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_or', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_as', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ln', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_wo', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_tum', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ki', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_st', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_fon', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ca', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_eu', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ak', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_bm', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_tw']] +[default0]: train_weighted_split_paths_path ................. None +[default0]: train_weighted_split_splits ..................... [['0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950']] +[default0]: train_weighted_split_weights .................... [['0.3924620202', '0.0797519865', '0.0645613968', '0.0592991658', '0.0584218969', '0.0492644683', '0.0485168956', '0.048344327', '0.0462777165', '0.0326663657', '0.0202046859', '0.0140392334', '0.0097374884', '0.0087708344', '0.0070173416', '0.0059077793', '0.0059055884', '0.0055112422', '0.0041465344', '0.0037157869', '0.0034255885', '0.0028662571', '0.0027776135', '0.0026823974', '0.0026594781', '0.0026358847', '0.0026264474', '0.0024850557', '0.0024808426', '0.0024194999', '0.0020327371', '0.0018436532', '0.0017427072', '0.0017007448', '0.0016458059', '0.0013303289', '0.0012908943', '0.0012221364', '0.001211688', '0.0012015765', '0.0011909595', '0.0011650068', '0.001138717', '0.0011385485', '0.0011275944', '0.0011195053']] +[default0]: universal_checkpoint ............................ False +[default0]: use_bnb_optimizer ............................... False +[default0]: use_checkpoint_lr_scheduler ..................... False +[default0]: use_contiguous_buffers_in_ddp ................... False +[default0]: use_cpu_initialization .......................... None +[default0]: use_one_sent_docs ............................... False +[default0]: use_pin_memory .................................. False +[default0]: valid_num_workers ............................... 2 +[default0]: valid_weighted_split_names ...................... ['validation_pretraining', 'validation'] +[default0]: valid_weighted_split_paths ...................... [['/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/ar/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_ar_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/ca/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_ca_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/code/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_code_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/en/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_en_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/es/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_es_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/eu/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_eu_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/fr/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_fr_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/id/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_id_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-as/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-as_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-bn/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-bn_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-gu/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-gu_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-hi/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-hi_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-kn/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-kn_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-ml/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-ml_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-mr/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-mr_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-ne/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-ne_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-or/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-or_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-pa/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-pa_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-ta/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-ta_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-te/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-te_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-ur/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-ur_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/nigercongo-all/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_nigercongo-all_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/oscar-en/meg_ds_bigscience_tokenizer_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/oscar-zh/meg_ds_bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/pt/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_pt_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/vi/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_vi_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/zhs/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_zhs_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/zht/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_zht_text_document'], ['/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_en', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_es', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_pt', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_code', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_fr', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ar', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_id', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_zh', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_hi', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_vi', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ur', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_te', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ta', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_bn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_mr', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_sw', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_gu', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_pa', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ne', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_yo', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ig', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ny', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_zu', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_xh', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_sn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ts', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_rw', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_lg', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_tn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_nso', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_rn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ml', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_kn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_or', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_as', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ln', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_wo', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_tum', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ki', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_st', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_fon', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ca', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_eu', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ak', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_bm', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_tw']] +[default0]: valid_weighted_split_paths_path ................. None +[default0]: valid_weighted_split_splits ..................... [['0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0'], ['0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1']] +[default0]: valid_weighted_split_weights .................... [['0.0330676168743166', '0.011242051312222764', '0.13027200903379185', '0.22171164529099704', '0.10667815627928671', '0.0015595123898173287', '0.13054018439603915', '0.01091803753667153', '0.00011021422347108609', '0.005492381453597748', '0.0004021215011318779', '0.007470068593492175', '0.0006190467776576425', '0.0010335296343329384', '0.0005012010684646179', '0.0006672772956128299', '0.00035928138344705506', '0.0005084433130291778', '0.0021137328219915496', '0.0009129946225980253', '0.0012454301613725426', '0.00031588689199263235', '0.08137213783015229', '0.055293935695898196', '0.04954150576361177', '0.02461641286531197', '0.12091748245519074', '0.0005177025345001541'], ['0.3924620202', '0.0797519865', '0.0645613968', '0.0592991658', '0.0584218969', '0.0492644683', '0.0485168956', '0.048344327', '0.0462777165', '0.0326663657', '0.0202046859', '0.0140392334', '0.0097374884', '0.0087708344', '0.0070173416', '0.0059077793', '0.0059055884', '0.0055112422', '0.0041465344', '0.0037157869', '0.0034255885', '0.0028662571', '0.0027776135', '0.0026823974', '0.0026594781', '0.0026358847', '0.0026264474', '0.0024850557', '0.0024808426', '0.0024194999', '0.0020327371', '0.0018436532', '0.0017427072', '0.0017007448', '0.0016458059', '0.0013303289', '0.0012908943', '0.0012221364', '0.001211688', '0.0012015765', '0.0011909595', '0.0011650068', '0.001138717', '0.0011385485', '0.0011275944', '0.0011195053']] +[default0]: virtual_pipeline_model_parallel_size ............ None +[default0]: vocab_extra_ids ................................. 0 +[default0]: vocab_file ...................................... None +[default0]: weight_decay .................................... 0.0001 +[default0]: world_size ...................................... 64 +[default0]: zero_allgather_bucket_size ...................... 0.0 +[default0]: zero_contigious_gradients ....................... False +[default0]: zero_reduce_bucket_size ......................... 0.0 +[default0]: zero_reduce_scatter ............................. False +[default0]: zero_stage ...................................... 1 +[default0]:-------------------- end of arguments --------------------- +[default0]:setting number of micro-batches to constant 128 +[default0]:> building PretrainedFromHF tokenizer ... +[default0]: vocab file is un-used. loading tokenizer from pre-trained model +[default0]:Offline mode: forcing local_files_only=True +[default0]:Offline mode: forcing local_files_only=True +[default0]:loading file tokenizer.json from cache at /gpfswork/rech/six/commun/models/models--bigscience--tokenizer/snapshots/d56c744c331ce0ffa516ed084785eb22da74b91e/tokenizer.json +[default0]:loading file added_tokens.json from cache at None +[default0]:loading file special_tokens_map.json from cache at /gpfswork/rech/six/commun/models/models--bigscience--tokenizer/snapshots/d56c744c331ce0ffa516ed084785eb22da74b91e/special_tokens_map.json +[default0]:loading file tokenizer_config.json from cache at /gpfswork/rech/six/commun/models/models--bigscience--tokenizer/snapshots/d56c744c331ce0ffa516ed084785eb22da74b91e/tokenizer_config.json +[default0]: > padded vocab (size: 250680) with 200 dummy tokens (new size: 250880) +[default0]:DeepSpeed general environment info: +[default0]:torch install path ............... ['/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch'] +[default0]:torch version .................... 1.12.0 +[default0]:torch cuda version ............... 11.3 +[default0]:torch hip version ................ None +[default0]:nvcc version ..................... 11.4 +[default0]:deepspeed install path ........... ['/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed'] +[default0]:deepspeed info ................... 0.7.1+8b2a6371, 8b2a6371, master +[default0]:deepspeed wheel compiled w. ...... torch 1.12, cuda 11.3 +[default0]:**** Git info for Megatron: git_hash=6c1018f git_branch=mtf-multival **** +[default0]:> initializing torch distributed ... +[default0]:[2022-10-07 08:52:21,458] [INFO] [comm.py:628:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl +ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 0 (pid: 184143) of binary: /gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/bin/python +ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 0 (pid: 177708) of binary: /gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/bin/python +ERROR:torch.distributed.elastic.agent.server.api:Error waiting on exit barrier. Elapsed: 315.993688583374 seconds +Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/agent/server/api.py", line 906, in _exit_barrier + store_util.barrier( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/utils/store.py", line 78, in barrier + synchronize(store, data, rank, world_size, key_prefix, barrier_timeout) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/utils/store.py", line 64, in synchronize + agent_data = get_all(store, rank, key_prefix, world_size) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/utils/store.py", line 34, in get_all + data = store.get(f"{prefix}{idx}") +RuntimeError: Socket Timeout +Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 194, in _run_module_as_main + return _run_code(code, main_globals, None, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 87, in _run_code + exec(code, run_globals) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 765, in + main() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 761, in main + run(args) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 752, in run + elastic_launch( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 131, in __call__ + return launch_agent(self._config, self._entrypoint, list(args)) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 245, in launch_agent + raise ChildFailedError( +torch.distributed.elastic.multiprocessing.errors.ChildFailedError: +============================================================ +/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py FAILED +------------------------------------------------------------ +Failures: +[1]: + time : 2022-10-07_08:52:20 + host : jean-zay-iam48-ib0 + rank : 57 (local_rank: 1) + exitcode : 1 (pid: 177709) + error_file: /tmp/torchelastic_puj1lyea/none_8mdm00r2/attempt_0/1/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 83, in initialize_megatron + assert torch.cuda.is_available(), 'Megatron requires CUDA.' + AssertionError: Megatron requires CUDA. + +[2]: + time : 2022-10-07_08:52:20 + host : jean-zay-iam48-ib0 + rank : 58 (local_rank: 2) + exitcode : 1 (pid: 177710) + error_file: /tmp/torchelastic_puj1lyea/none_8mdm00r2/attempt_0/2/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 83, in initialize_megatron + assert torch.cuda.is_available(), 'Megatron requires CUDA.' + AssertionError: Megatron requires CUDA. + +[3]: + time : 2022-10-07_08:52:20 + host : jean-zay-iam48-ib0 + rank : 59 (local_rank: 3) + exitcode : 1 (pid: 177711) + error_file: /tmp/torchelastic_puj1lyea/none_8mdm00r2/attempt_0/3/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 83, in initialize_megatron + assert torch.cuda.is_available(), 'Megatron requires CUDA.' + AssertionError: Megatron requires CUDA. + +[4]: + time : 2022-10-07_08:52:20 + host : jean-zay-iam48-ib0 + rank : 60 (local_rank: 4) + exitcode : 1 (pid: 177712) + error_file: /tmp/torchelastic_puj1lyea/none_8mdm00r2/attempt_0/4/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 83, in initialize_megatron + assert torch.cuda.is_available(), 'Megatron requires CUDA.' + AssertionError: Megatron requires CUDA. + +[5]: + time : 2022-10-07_08:52:20 + host : jean-zay-iam48-ib0 + rank : 61 (local_rank: 5) + exitcode : 1 (pid: 177713) + error_file: /tmp/torchelastic_puj1lyea/none_8mdm00r2/attempt_0/5/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 83, in initialize_megatron + assert torch.cuda.is_available(), 'Megatron requires CUDA.' + AssertionError: Megatron requires CUDA. + +[6]: + time : 2022-10-07_08:52:20 + host : jean-zay-iam48-ib0 + rank : 62 (local_rank: 6) + exitcode : 1 (pid: 177714) + error_file: /tmp/torchelastic_puj1lyea/none_8mdm00r2/attempt_0/6/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 83, in initialize_megatron + assert torch.cuda.is_available(), 'Megatron requires CUDA.' + AssertionError: Megatron requires CUDA. + +[7]: + time : 2022-10-07_08:52:20 + host : jean-zay-iam48-ib0 + rank : 63 (local_rank: 7) + exitcode : 1 (pid: 177715) + error_file: /tmp/torchelastic_puj1lyea/none_8mdm00r2/attempt_0/7/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 83, in initialize_megatron + assert torch.cuda.is_available(), 'Megatron requires CUDA.' + AssertionError: Megatron requires CUDA. + +------------------------------------------------------------ +Root Cause (first observed failure): +[0]: + time : 2022-10-07_08:52:20 + host : jean-zay-iam48-ib0 + rank : 56 (local_rank: 0) + exitcode : 1 (pid: 177708) + error_file: /tmp/torchelastic_puj1lyea/none_8mdm00r2/attempt_0/0/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 83, in initialize_megatron + assert torch.cuda.is_available(), 'Megatron requires CUDA.' + AssertionError: Megatron requires CUDA. + +============================================================ +srun: error: jean-zay-iam48: task 7: Exited with exit code 1 +srun: launch/slurm: _step_signal: Terminating StepId=2094446.0 +slurmstepd: error: *** STEP 2094446.0 ON jean-zay-iam38 CANCELLED AT 2022-10-07T08:57:40 *** +WARNING:torch.distributed.elastic.agent.server.api:Received 15 death signal, shutting down workers +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 374957 closing signal SIGTERM +WARNING:torch.distributed.elastic.agent.server.api:Received 15 death signal, shutting down workers +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 346595 closing signal SIGTERM +WARNING:torch.distributed.elastic.agent.server.api:Received 15 death signal, shutting down workers +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 365186 closing signal SIGTERM +WARNING:torch.distributed.elastic.agent.server.api:Received 15 death signal, shutting down workers +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 346548 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 346549 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 374958 closing signal SIGTERM +WARNING:torch.distributed.elastic.agent.server.api:Received 15 death signal, shutting down workers +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 374959 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 374960 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 302754 closing signal SIGTERM +WARNING:torch.distributed.elastic.agent.server.api:Received 15 death signal, shutting down workers +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 289181 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 346550 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 365187 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 289182 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 346596 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 365188 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 346597 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 365189 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 365190 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 302755 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 289183 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 302756 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 374961 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 302757 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 289184 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 374962 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 346551 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 289185 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 346552 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 346598 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 289186 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 346553 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 365191 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 346554 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 346599 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 365192 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 346555 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 365193 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 346600 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 374963 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 302758 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 374964 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 346601 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 302759 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 302760 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 346602 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 302761 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 289187 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 289188 closing signal SIGTERM +WARNING:torch.distributed.elastic.agent.server.api:Received 15 death signal, shutting down workers +Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/agent/server/api.py", line 906, in _exit_barrier + store_util.barrier( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/utils/store.py", line 78, in barrier + synchronize(store, data, rank, world_size, key_prefix, barrier_timeout) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/utils/store.py", line 64, in synchronize + agent_data = get_all(store, rank, key_prefix, world_size) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/utils/store.py", line 34, in get_all + data = store.get(f"{prefix}{idx}") +RuntimeError: Socket Timeout + +During handling of the above exception, another exception occurred: + +Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 194, in _run_module_as_main + return _run_code(code, main_globals, None, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 87, in _run_code + exec(code, run_globals) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 765, in + main() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 761, in main + run(args) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 752, in run + elastic_launch( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 131, in __call__ + return launch_agent(self._config, self._entrypoint, list(args)) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 236, in launch_agent + result = agent.run() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/metrics/api.py", line 125, in wrapper + result = f(*args, **kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/agent/server/api.py", line 709, in run + result = self._invoke_run(role) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/agent/server/api.py", line 877, in _invoke_run + self._exit_barrier() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/agent/server/api.py", line 906, in _exit_barrier + store_util.barrier( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/api.py", line 60, in _terminate_process_handler + raise SignalException(f"Process {os.getpid()} got signal: {sigval}", sigval=sigval) +torch.distributed.elastic.multiprocessing.api.SignalException: Process 184123 got signal: 15 +srun: error: jean-zay-iam47: task 6: Exited with exit code 1 +Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 194, in _run_module_as_main + return _run_code(code, main_globals, None, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 87, in _run_code + exec(code, run_globals) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 765, in + main() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 761, in main + run(args) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 752, in run + elastic_launch( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 131, in __call__ + return launch_agent(self._config, self._entrypoint, list(args)) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 236, in launch_agent + result = agent.run() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/metrics/api.py", line 125, in wrapper + result = f(*args, **kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/agent/server/api.py", line 709, in run + result = self._invoke_run(role) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/agent/server/api.py", line 850, in _invoke_run + time.sleep(monitor_interval) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/api.py", line 60, in _terminate_process_handler + raise SignalException(f"Process {os.getpid()} got signal: {sigval}", sigval=sigval) +torch.distributed.elastic.multiprocessing.api.SignalException: Process 346499 got signal: 15 +Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 194, in _run_module_as_main + return _run_code(code, main_globals, None, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 87, in _run_code + exec(code, run_globals) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 765, in +Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 194, in _run_module_as_main + return _run_code(code, main_globals, None, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 87, in _run_code + exec(code, run_globals) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 765, in + main() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + main() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 761, in main + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 761, in main + run(args) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 752, in run + run(args) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 752, in run + elastic_launch( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 131, in __call__ + elastic_launch( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 131, in __call__ + return launch_agent(self._config, self._entrypoint, list(args)) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 236, in launch_agent + result = agent.run() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/metrics/api.py", line 125, in wrapper + result = f(*args, **kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/agent/server/api.py", line 709, in run + result = self._invoke_run(role) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/agent/server/api.py", line 850, in _invoke_run + time.sleep(monitor_interval) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/api.py", line 60, in _terminate_process_handler + raise SignalException(f"Process {os.getpid()} got signal: {sigval}", sigval=sigval) +torch.distributed.elastic.multiprocessing.api.SignalException: Process 346551 got signal: 15 + return launch_agent(self._config, self._entrypoint, list(args)) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 236, in launch_agent + result = agent.run() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/metrics/api.py", line 125, in wrapper + result = f(*args, **kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/agent/server/api.py", line 709, in run + result = self._invoke_run(role) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/agent/server/api.py", line 850, in _invoke_run + time.sleep(monitor_interval) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/api.py", line 60, in _terminate_process_handler + raise SignalException(f"Process {os.getpid()} got signal: {sigval}", sigval=sigval) +torch.distributed.elastic.multiprocessing.api.SignalException: Process 302711 got signal: 15 +Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 194, in _run_module_as_main + return _run_code(code, main_globals, None, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 87, in _run_code + exec(code, run_globals) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 765, in + main() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 761, in main + run(args) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 752, in run + elastic_launch( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 131, in __call__ + return launch_agent(self._config, self._entrypoint, list(args)) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 236, in launch_agent + result = agent.run() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/metrics/api.py", line 125, in wrapper + result = f(*args, **kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/agent/server/api.py", line 709, in run + result = self._invoke_run(role) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/agent/server/api.py", line 850, in _invoke_run + time.sleep(monitor_interval) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/api.py", line 60, in _terminate_process_handler + raise SignalException(f"Process {os.getpid()} got signal: {sigval}", sigval=sigval) +torch.distributed.elastic.multiprocessing.api.SignalException: Process 289139 got signal: 15 +Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 194, in _run_module_as_main + return _run_code(code, main_globals, None, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 87, in _run_code + exec(code, run_globals) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 765, in + main() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 761, in main + run(args) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 752, in run + elastic_launch( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 131, in __call__ + return launch_agent(self._config, self._entrypoint, list(args)) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 236, in launch_agent + result = agent.run() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/metrics/api.py", line 125, in wrapper + result = f(*args, **kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/agent/server/api.py", line 709, in run + result = self._invoke_run(role) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/agent/server/api.py", line 850, in _invoke_run + time.sleep(monitor_interval) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/api.py", line 60, in _terminate_process_handler + raise SignalException(f"Process {os.getpid()} got signal: {sigval}", sigval=sigval) +torch.distributed.elastic.multiprocessing.api.SignalException: Process 365137 got signal: 15 +Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 194, in _run_module_as_main + return _run_code(code, main_globals, None, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 87, in _run_code + exec(code, run_globals) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 765, in + main() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 761, in main + run(args) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 752, in run + elastic_launch( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 131, in __call__ + return launch_agent(self._config, self._entrypoint, list(args)) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 236, in launch_agent + result = agent.run() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/metrics/api.py", line 125, in wrapper + result = f(*args, **kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/agent/server/api.py", line 709, in run + result = self._invoke_run(role) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/agent/server/api.py", line 850, in _invoke_run + time.sleep(monitor_interval) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/api.py", line 60, in _terminate_process_handler + raise SignalException(f"Process {os.getpid()} got signal: {sigval}", sigval=sigval) +torch.distributed.elastic.multiprocessing.api.SignalException: Process 374917 got signal: 15 +srun: error: jean-zay-iam43: task 5: Exited with exit code 1 +srun: error: jean-zay-iam42: task 4: Exited with exit code 1 +srun: error: jean-zay-iam39: task 1: Exited with exit code 1 +srun: error: jean-zay-iam40: task 2: Exited with exit code 1 +srun: error: jean-zay-iam41: task 3: Exited with exit code 1 +srun: error: jean-zay-iam38: task 0: Exited with exit code 1 +WARNING:__main__: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +WARNING:__main__: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +WARNING:__main__: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +WARNING:__main__: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +WARNING:__main__: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +WARNING:__main__: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +WARNING:__main__: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +WARNING:__main__: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +[default0]:using world size: 64, data-parallel-size: 16, tensor-model-parallel size: 2, pipeline-model-parallel size: 2 +[default0]:using torch.float16 for parameters ... +[default0]:------------------------ arguments ------------------------ +[default0]: abort_on_unmet_fused_kernel_constraints ......... True +[default0]: accumulate_allreduce_grads_in_fp32 .............. False +[default0]: adam_beta1 ...................................... 0.9 +[default0]: adam_beta2 ...................................... 0.95 +[default0]: adam_eps ........................................ 1e-08 +[default0]: adlr_autoresume ................................. False +[default0]: adlr_autoresume_interval ........................ 1000 +[default0]: apply_query_key_layer_scaling ................... True +[default0]: apply_residual_connection_post_layernorm ........ False +[default0]: attention_dropout ............................... 0.1 +[default0]: attention_softmax_in_fp32 ....................... False +[default0]: bert_binary_head ................................ True +[default0]: bert_load ....................................... None +[default0]: bf16 ............................................ False +[default0]: bias_dropout_fusion ............................. True +[default0]: bias_gelu_fusion ................................ True +[default0]: biencoder_projection_dim ........................ 0 +[default0]: biencoder_shared_query_context_model ............ False +[default0]: block_data_path ................................. None +[default0]: checkpoint_activations .......................... True +[default0]: checkpoint_in_cpu ............................... False +[default0]: checkpoint_num_layers ........................... 1 +[default0]: clip_grad ....................................... 1.0 +[default0]: codecarbon_dir .................................. None +[default0]: consumed_train_samples .......................... 0 +[default0]: consumed_train_tokens ........................... 0 +[default0]: consumed_valid_samples .......................... 0 +[default0]: contigious_checkpointing ........................ False +[default0]: cpu_optimizer ................................... False +[default0]: cpu_torch_adam .................................. False +[default0]: curriculum_learning ............................. False +[default0]: data_impl ....................................... mmap +[default0]: data_parallel_size .............................. 16 +[default0]: data_path ....................................... None +[default0]: dataloader_type ................................. single +[default0]: DDP_impl ........................................ local +[default0]: decoder_seq_length .............................. None +[default0]: deepscale ....................................... False +[default0]: deepscale_config ................................ None +[default0]: deepspeed ....................................... True +[default0]: deepspeed_activation_checkpointing .............. True +[default0]: deepspeed_config ................................ ./ds_config.2094507.json +[default0]: deepspeed_mpi ................................... False +[default0]: distribute_checkpointed_activations ............. False +[default0]: distributed_backend ............................. nccl +[default0]: embed_layernorm ................................. True +[default0]: embedding_path .................................. None +[default0]: encoder_seq_length .............................. 2048 +[default0]: eod_mask_loss ................................... False +[default0]: eval_interval ................................... 250 +[default0]: eval_iters ...................................... 5 +[default0]: eval_only ....................................... None +[default0]: evidence_data_path .............................. None +[default0]: exit_duration_in_mins ........................... 5990 +[default0]: exit_interval ................................... None +[default0]: ffn_hidden_size ................................. 8192 +[default0]: finetune ........................................ False +[default0]: fp16 ............................................ True +[default0]: fp16_lm_cross_entropy ........................... False +[default0]: fp32_residual_connection ........................ False +[default0]: gigaflos_no_embeds .............................. 0 +[default0]: global_batch_size ............................... 2048 +[default0]: glu_activation .................................. None +[default0]: hidden_dropout .................................. 0.1 +[default0]: hidden_size ..................................... 2048 +[default0]: hysteresis ...................................... 2 +[default0]: ict_head_size ................................... None +[default0]: ict_load ........................................ None +[default0]: img_dim ......................................... 224 +[default0]: indexer_batch_size .............................. 128 +[default0]: indexer_log_interval ............................ 1000 +[default0]: inference ....................................... False +[default0]: init_method_std ................................. 0.0048 +[default0]: init_method_xavier_uniform ...................... False +[default0]: initial_loss_scale .............................. 4294967296 +[default0]: kill_switch_path ................................ /gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/kill-switch-tr13b-1B3-mtf +[default0]: kv_channels ..................................... 128 +[default0]: layernorm_epsilon ............................... 1e-05 +[default0]: lazy_mpu_init ................................... None +[default0]: load ............................................ /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq +[default0]: local_rank ...................................... None +[default0]: log_batch_size_to_tensorboard ................... True +[default0]: log_interval .................................... 1 +[default0]: log_learning_rate_to_tensorboard ................ True +[default0]: log_level ....................................... None +[default0]: log_level_replica ............................... None +[default0]: log_loss_scale_to_tensorboard ................... True +[default0]: log_num_zeros_in_grad ........................... False +[default0]: log_params_norm ................................. False +[default0]: log_path ........................................ None +[default0]: log_timers_to_tensorboard ....................... True +[default0]: log_validation_ppl_to_tensorboard ............... True +[default0]: loss_on_targets_only ............................ False +[default0]: loss_scale ...................................... None +[default0]: loss_scale_window ............................... 1000 +[default0]: lr .............................................. 2e-05 +[default0]: lr_decay_iters .................................. None +[default0]: lr_decay_samples ................................ None +[default0]: lr_decay_style .................................. constant +[default0]: lr_decay_tokens ................................. None +[default0]: lr_warmup_fraction .............................. None +[default0]: lr_warmup_iters ................................. 0 +[default0]: lr_warmup_samples ............................... 0 +[default0]: make_vocab_size_divisible_by .................... 128 +[default0]: mask_prob ....................................... 0.15 +[default0]: masked_softmax_fusion ........................... True +[default0]: max_position_embeddings ......................... 2048 +[default0]: mean_noise_span_length .......................... None +[default0]: memory_centric_tiled_linear ..................... False +[default0]: merge_file ...................................... None +[default0]: micro_batch_size ................................ 1 +[default0]: min_loss_scale .................................. 1.0 +[default0]: min_lr .......................................... 0.0 +[default0]: mmap_warmup ..................................... False +[default0]: no_load_optim ................................... True +[default0]: no_load_rng ..................................... None +[default0]: no_save_optim ................................... None +[default0]: no_save_rng ..................................... None +[default0]: noise_density ................................... None +[default0]: norm_target_loss ................................ True +[default0]: num_attention_heads ............................. 16 +[default0]: num_channels .................................... 3 +[default0]: num_classes ..................................... 1000 +[default0]: num_layers ...................................... 24 +[default0]: num_layers_per_virtual_pipeline_stage ........... None +[default0]: num_workers ..................................... 2 +[default0]: onnx_safe ....................................... None +[default0]: openai_gelu ..................................... False +[default0]: optimizer ....................................... adam +[default0]: override_lr_scheduler ........................... False +[default0]: pad_vocab_size_to ............................... 250880 +[default0]: params_dtype .................................... torch.float16 +[default0]: partition_activations ........................... False +[default0]: patch_dim ....................................... 16 +[default0]: pipeline_model_parallel_size .................... 2 +[default0]: position_embedding_type ......................... PositionEmbeddingType.alibi +[default0]: pp_partition_method ............................. type:transformer|embedding +[default0]: prefixlm ........................................ False +[default0]: profile_backward ................................ False +[default0]: query_in_block_prob ............................. 0.1 +[default0]: rampup_batch_size ............................... None +[default0]: rank ............................................ 0 +[default0]: remote_device ................................... none +[default0]: reset_attention_mask ............................ False +[default0]: reset_position_ids .............................. False +[default0]: reset_progress .................................. True +[default0]: retriever_report_topk_accuracies ................ [] +[default0]: retriever_score_scaling ......................... False +[default0]: retriever_seq_length ............................ 256 +[default0]: reweight_loss_based_on_position_frequency ....... False +[default0]: sample_rate ..................................... 1.0 +[default0]: save ............................................ /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq +[default0]: save_interval ................................... 2 +[default0]: scatter_gather_tensors_in_pipeline .............. True +[default0]: scattered_embeddings ............................ False +[default0]: seed ............................................ 42 +[default0]: seq_length ...................................... 2048 +[default0]: sgd_momentum .................................... 0.9 +[default0]: short_seq_prob .................................. 0.1 +[default0]: skip_train_iteration_range ...................... None +[default0]: split ........................................... None +[default0]: split_transformers .............................. False +[default0]: sync_tp_duplicated_parameters ................... False +[default0]: synchronize_each_layer .......................... False +[default0]: tensor_model_parallel_size ...................... 2 +[default0]: tensorboard_dir ................................. /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/tr13b-1B3-ml-logs/tensorboard/xp3capmixnewcodelonglossseq +[default0]: tensorboard_log_interval ........................ 1 +[default0]: tensorboard_queue_size .......................... 5 +[default0]: test_weighted_split_paths ....................... None +[default0]: test_weighted_split_paths_path .................. None +[default0]: tile_factor ..................................... 1 +[default0]: titles_data_path ................................ None +[default0]: tokenizer_name_or_path .......................... bigscience/tokenizer +[default0]: tokenizer_type .................................. PretrainedFromHF +[default0]: train_iters ..................................... None +[default0]: train_samples ................................... 6348800 +[default0]: train_tokens .................................... None +[default0]: train_weighted_split_names ...................... ['train'] +[default0]: train_weighted_split_paths ...................... [['/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_en', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_es', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_pt', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_code', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_fr', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ar', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_id', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_zh', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_hi', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_vi', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ur', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_te', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ta', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_bn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_mr', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_sw', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_gu', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_pa', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ne', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_yo', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ig', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ny', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_zu', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_xh', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_sn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ts', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_rw', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_lg', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_tn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_nso', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_rn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ml', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_kn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_or', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_as', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ln', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_wo', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_tum', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ki', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_st', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_fon', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ca', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_eu', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ak', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_bm', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_tw']] +[default0]: train_weighted_split_paths_path ................. None +[default0]: train_weighted_split_splits ..................... [['0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950']] +[default0]: train_weighted_split_weights .................... [['0.3924620202', '0.0797519865', '0.0645613968', '0.0592991658', '0.0584218969', '0.0492644683', '0.0485168956', '0.048344327', '0.0462777165', '0.0326663657', '0.0202046859', '0.0140392334', '0.0097374884', '0.0087708344', '0.0070173416', '0.0059077793', '0.0059055884', '0.0055112422', '0.0041465344', '0.0037157869', '0.0034255885', '0.0028662571', '0.0027776135', '0.0026823974', '0.0026594781', '0.0026358847', '0.0026264474', '0.0024850557', '0.0024808426', '0.0024194999', '0.0020327371', '0.0018436532', '0.0017427072', '0.0017007448', '0.0016458059', '0.0013303289', '0.0012908943', '0.0012221364', '0.001211688', '0.0012015765', '0.0011909595', '0.0011650068', '0.001138717', '0.0011385485', '0.0011275944', '0.0011195053']] +[default0]: universal_checkpoint ............................ False +[default0]: use_bnb_optimizer ............................... False +[default0]: use_checkpoint_lr_scheduler ..................... False +[default0]: use_contiguous_buffers_in_ddp ................... False +[default0]: use_cpu_initialization .......................... None +[default0]: use_one_sent_docs ............................... False +[default0]: use_pin_memory .................................. False +[default0]: valid_num_workers ............................... 2 +[default0]: valid_weighted_split_names ...................... ['validation_pretraining', 'validation'] +[default0]: valid_weighted_split_paths ...................... [['/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/ar/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_ar_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/ca/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_ca_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/code/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_code_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/en/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_en_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/es/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_es_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/eu/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_eu_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/fr/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_fr_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/id/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_id_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-as/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-as_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-bn/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-bn_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-gu/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-gu_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-hi/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-hi_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-kn/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-kn_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-ml/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-ml_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-mr/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-mr_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-ne/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-ne_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-or/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-or_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-pa/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-pa_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-ta/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-ta_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-te/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-te_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-ur/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-ur_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/nigercongo-all/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_nigercongo-all_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/oscar-en/meg_ds_bigscience_tokenizer_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/oscar-zh/meg_ds_bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/pt/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_pt_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/vi/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_vi_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/zhs/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_zhs_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/zht/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_zht_text_document'], ['/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_en', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_es', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_pt', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_code', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_fr', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ar', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_id', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_zh', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_hi', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_vi', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ur', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_te', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ta', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_bn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_mr', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_sw', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_gu', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_pa', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ne', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_yo', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ig', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ny', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_zu', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_xh', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_sn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ts', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_rw', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_lg', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_tn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_nso', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_rn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ml', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_kn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_or', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_as', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ln', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_wo', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_tum', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ki', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_st', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_fon', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ca', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_eu', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ak', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_bm', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_tw']] +[default0]: valid_weighted_split_paths_path ................. None +[default0]: valid_weighted_split_splits ..................... [['0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0'], ['0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1']] +[default0]: valid_weighted_split_weights .................... [['0.0330676168743166', '0.011242051312222764', '0.13027200903379185', '0.22171164529099704', '0.10667815627928671', '0.0015595123898173287', '0.13054018439603915', '0.01091803753667153', '0.00011021422347108609', '0.005492381453597748', '0.0004021215011318779', '0.007470068593492175', '0.0006190467776576425', '0.0010335296343329384', '0.0005012010684646179', '0.0006672772956128299', '0.00035928138344705506', '0.0005084433130291778', '0.0021137328219915496', '0.0009129946225980253', '0.0012454301613725426', '0.00031588689199263235', '0.08137213783015229', '0.055293935695898196', '0.04954150576361177', '0.02461641286531197', '0.12091748245519074', '0.0005177025345001541'], ['0.3924620202', '0.0797519865', '0.0645613968', '0.0592991658', '0.0584218969', '0.0492644683', '0.0485168956', '0.048344327', '0.0462777165', '0.0326663657', '0.0202046859', '0.0140392334', '0.0097374884', '0.0087708344', '0.0070173416', '0.0059077793', '0.0059055884', '0.0055112422', '0.0041465344', '0.0037157869', '0.0034255885', '0.0028662571', '0.0027776135', '0.0026823974', '0.0026594781', '0.0026358847', '0.0026264474', '0.0024850557', '0.0024808426', '0.0024194999', '0.0020327371', '0.0018436532', '0.0017427072', '0.0017007448', '0.0016458059', '0.0013303289', '0.0012908943', '0.0012221364', '0.001211688', '0.0012015765', '0.0011909595', '0.0011650068', '0.001138717', '0.0011385485', '0.0011275944', '0.0011195053']] +[default0]: virtual_pipeline_model_parallel_size ............ None +[default0]: vocab_extra_ids ................................. 0 +[default0]: vocab_file ...................................... None +[default0]: weight_decay .................................... 0.0001 +[default0]: world_size ...................................... 64 +[default0]: zero_allgather_bucket_size ...................... 0.0 +[default0]: zero_contigious_gradients ....................... False +[default0]: zero_reduce_bucket_size ......................... 0.0 +[default0]: zero_reduce_scatter ............................. False +[default0]: zero_stage ...................................... 1 +[default0]:-------------------- end of arguments --------------------- +[default0]:setting number of micro-batches to constant 128 +[default0]:> building PretrainedFromHF tokenizer ... +[default0]: vocab file is un-used. loading tokenizer from pre-trained model +[default0]:Offline mode: forcing local_files_only=True +[default0]:Offline mode: forcing local_files_only=True +[default0]:loading file tokenizer.json from cache at /gpfswork/rech/six/commun/models/models--bigscience--tokenizer/snapshots/d56c744c331ce0ffa516ed084785eb22da74b91e/tokenizer.json +[default0]:loading file added_tokens.json from cache at None +[default0]:loading file special_tokens_map.json from cache at /gpfswork/rech/six/commun/models/models--bigscience--tokenizer/snapshots/d56c744c331ce0ffa516ed084785eb22da74b91e/special_tokens_map.json +[default0]:loading file tokenizer_config.json from cache at /gpfswork/rech/six/commun/models/models--bigscience--tokenizer/snapshots/d56c744c331ce0ffa516ed084785eb22da74b91e/tokenizer_config.json +[default0]: > padded vocab (size: 250680) with 200 dummy tokens (new size: 250880) +[default0]:DeepSpeed general environment info: +[default0]:torch install path ............... ['/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch'] +[default0]:torch version .................... 1.12.0 +[default0]:torch cuda version ............... 11.3 +[default0]:torch hip version ................ None +[default0]:nvcc version ..................... 11.4 +[default0]:deepspeed install path ........... ['/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed'] +[default0]:deepspeed info ................... 0.7.1+8b2a6371, 8b2a6371, master +[default0]:deepspeed wheel compiled w. ...... torch 1.12, cuda 11.3 +[default0]:**** Git info for Megatron: git_hash=6c1018f git_branch=mtf-multival **** +[default0]:> initializing torch distributed ... +[default0]:[2022-10-07 09:01:30,008] [INFO] [comm.py:628:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl +[default7]:> setting tensorboard ... +[default0]:> initializing tensor model parallel with size 2 +[default0]:> initializing pipeline model parallel with size 2 +[default0]:> setting random seeds to 42 ... +[default0]:[2022-10-07 09:01:32,064] [INFO] [checkpointing.py:226:model_parallel_cuda_manual_seed] > initializing model parallel cuda seeds on global rank 0, model parallel rank 0, and data parallel rank 0 with model parallel seed: 2760 and data parallel seed: 42 +[default0]:> compiling dataset index builder ... +[default0]:make: Entering directory '/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/data' +[default0]:make: Nothing to be done for 'default'. +[default0]:make: Leaving directory '/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/data' +[default0]:>>> done with dataset index builder. Compilation time: 0.086 seconds +[default0]:> compiling and loading fused kernels ... +[default0]:Detected CUDA files, patching ldflags +[default0]:Emitting ninja build file /gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/fused_kernels/build/build.ninja... +[default0]:Building extension module scaled_upper_triang_masked_softmax_cuda... +[default0]:Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) +[default0]:ninja: no work to do. +[default0]:Loading extension module scaled_upper_triang_masked_softmax_cuda... +[default0]:Detected CUDA files, patching ldflags +[default0]:Emitting ninja build file /gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/fused_kernels/build/build.ninja... +[default0]:Building extension module scaled_masked_softmax_cuda... +[default0]:Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) +[default0]:ninja: no work to do. +[default0]:Loading extension module scaled_masked_softmax_cuda... +[default0]:Detected CUDA files, patching ldflags +[default0]:Emitting ninja build file /gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/fused_kernels/build/build.ninja... +[default0]:Building extension module fused_mix_prec_layer_norm_cuda... +[default0]:Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) +[default0]:ninja: no work to do. +[default0]:Loading extension module fused_mix_prec_layer_norm_cuda... +[default0]:>>> done with compiling and loading fused kernels. Compilation time: 6.079 seconds +[default0]:time to initialize megatron (seconds): -45.769 +[default0]:[after megatron is initialized] datetime: 2022-10-07 09:01:38 +[default0]:building GPT model ... +[default0]:[2022-10-07 09:01:38,283] [INFO] [utils.py:827:see_memory_usage] Before Building Model +[default0]:[2022-10-07 09:01:38,283] [INFO] [utils.py:828:see_memory_usage] MA 0.0 GB Max_MA 0.0 GB CA 0.0 GB Max_CA 0 GB +[default0]:[2022-10-07 09:01:38,283] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 32.82 GB, percent = 6.5% +[default0]:SEED_LAYERS=False BASE_SEED=1234 SEED_FN=None +[default0]:Using topology: {ProcessCoord(pipe=0, data=0, model=0): 0, ProcessCoord(pipe=0, data=0, model=1): 1, ProcessCoord(pipe=0, data=1, model=0): 2, ProcessCoord(pipe=0, data=1, model=1): 3, ProcessCoord(pipe=0, data=2, model=0): 4, ProcessCoord(pipe=0, data=2, model=1): 5, ProcessCoord(pipe=0, data=3, model=0): 6, ProcessCoord(pipe=0, data=3, model=1): 7, ProcessCoord(pipe=0, data=4, model=0): 8, ProcessCoord(pipe=0, data=4, model=1): 9, ProcessCoord(pipe=0, data=5, model=0): 10, ProcessCoord(pipe=0, data=5, model=1): 11, ProcessCoord(pipe=0, data=6, model=0): 12, ProcessCoord(pipe=0, data=6, model=1): 13, ProcessCoord(pipe=0, data=7, model=0): 14, ProcessCoord(pipe=0, data=7, model=1): 15, ProcessCoord(pipe=0, data=8, model=0): 16, ProcessCoord(pipe=0, data=8, model=1): 17, ProcessCoord(pipe=0, data=9, model=0): 18, ProcessCoord(pipe=0, data=9, model=1): 19, ProcessCoord(pipe=0, data=10, model=0): 20, ProcessCoord(pipe=0, data=10, model=1): 21, ProcessCoord(pipe=0, data=11, model=0): 22, ProcessCoord(pipe=0, data=11, model=1): 23, ProcessCoord(pipe=0, data=12, model=0): 24, ProcessCoord(pipe=0, data=12, model=1): 25, ProcessCoord(pipe=0, data=13, model=0): 26, ProcessCoord(pipe=0, data=13, model=1): 27, ProcessCoord(pipe=0, data=14, model=0): 28, ProcessCoord(pipe=0, data=14, model=1): 29, ProcessCoord(pipe=0, data=15, model=0): 30, ProcessCoord(pipe=0, data=15, model=1): 31, ProcessCoord(pipe=1, data=0, model=0): 32, ProcessCoord(pipe=1, data=0, model=1): 33, ProcessCoord(pipe=1, data=1, model=0): 34, ProcessCoord(pipe=1, data=1, model=1): 35, ProcessCoord(pipe=1, data=2, model=0): 36, ProcessCoord(pipe=1, data=2, model=1): 37, ProcessCoord(pipe=1, data=3, model=0): 38, ProcessCoord(pipe=1, data=3, model=1): 39, ProcessCoord(pipe=1, data=4, model=0): 40, ProcessCoord(pipe=1, data=4, model=1): 41, ProcessCoord(pipe=1, data=5, model=0): 42, ProcessCoord(pipe=1, data=5, model=1): 43, ProcessCoord(pipe=1, data=6, model=0): 44, ProcessCoord(pipe=1, data=6, model=1): 45, ProcessCoord(pipe=1, data=7, model=0): 46, ProcessCoord(pipe=1, data=7, model=1): 47, ProcessCoord(pipe=1, data=8, model=0): 48, ProcessCoord(pipe=1, data=8, model=1): 49, ProcessCoord(pipe=1, data=9, model=0): 50, ProcessCoord(pipe=1, data=9, model=1): 51, ProcessCoord(pipe=1, data=10, model=0): 52, ProcessCoord(pipe=1, data=10, model=1): 53, ProcessCoord(pipe=1, data=11, model=0): 54, ProcessCoord(pipe=1, data=11, model=1): 55, ProcessCoord(pipe=1, data=12, model=0): 56, ProcessCoord(pipe=1, data=12, model=1): 57, ProcessCoord(pipe=1, data=13, model=0): 58, ProcessCoord(pipe=1, data=13, model=1): 59, ProcessCoord(pipe=1, data=14, model=0): 60, ProcessCoord(pipe=1, data=14, model=1): 61, ProcessCoord(pipe=1, data=15, model=0): 62, ProcessCoord(pipe=1, data=15, model=1): 63} +[default0]:[2022-10-07 09:01:39,129] [INFO] [module.py:366:_partition_layers] Partitioning pipeline stages with method type:transformer|embedding +[default0]:stage=0 layers=15 +[default0]: 0: _to_float16 +[default0]: 1: EmbeddingPipe +[default0]: 2: +[default0]: 3: ParallelTransformerLayerPipe +[default0]: 4: ParallelTransformerLayerPipe +[default0]: 5: ParallelTransformerLayerPipe +[default0]: 6: ParallelTransformerLayerPipe +[default0]: 7: ParallelTransformerLayerPipe +[default0]: 8: ParallelTransformerLayerPipe +[default0]: 9: ParallelTransformerLayerPipe +[default0]: 10: ParallelTransformerLayerPipe +[default0]: 11: ParallelTransformerLayerPipe +[default0]: 12: ParallelTransformerLayerPipe +[default0]: 13: ParallelTransformerLayerPipe +[default0]: 14: ParallelTransformerLayerPipe +[default0]:stage=1 layers=16 +[default0]: 15: ParallelTransformerLayerPipe +[default0]: 16: ParallelTransformerLayerPipe +[default0]: 17: ParallelTransformerLayerPipe +[default0]: 18: ParallelTransformerLayerPipe +[default0]: 19: ParallelTransformerLayerPipe +[default0]: 20: ParallelTransformerLayerPipe +[default0]: 21: ParallelTransformerLayerPipe +[default0]: 22: ParallelTransformerLayerPipe +[default0]: 23: ParallelTransformerLayerPipe +[default0]: 24: ParallelTransformerLayerPipe +[default0]: 25: ParallelTransformerLayerPipe +[default0]: 26: ParallelTransformerLayerPipe +[default0]: 27: undo +[default0]: 28: MixedFusedLayerNorm +[default0]: 29: EmbeddingPipe +[default0]: 30: float16_to_fp32 +[default0]: loss: CrossEntropy +[default0]:[2022-10-07 09:01:40,017] [INFO] [utils.py:827:see_memory_usage] After Building Model +[default0]:[2022-10-07 09:01:40,018] [INFO] [utils.py:828:see_memory_usage] MA 1.04 GB Max_MA 1.04 GB CA 1.04 GB Max_CA 1 GB +[default0]:[2022-10-07 09:01:40,018] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 33.24 GB, percent = 6.6% +[default0]:setting training iterations to 3100 +[default0]:> learning rate decay style: constant +[default0]:DeepSpeed is enabled. +[default0]:[2022-10-07 09:01:40,020] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed info: version=0.7.1+8b2a6371, git-hash=8b2a6371, git-branch=master +[default0]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default2]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default0]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default4]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default3]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default1]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default1]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default5]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default6]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default7]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default0]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default2]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default3]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default6]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default4]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default7]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default5]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default2]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default1]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default0]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default3]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default4]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default6]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default7]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default5]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default2]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default4]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default3]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default5]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default6]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default4]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default1]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default3]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default0]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default7]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default2]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default6]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default7]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default5]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default0]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default4]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default1]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default2]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default3]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default6]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default7]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default5]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default1]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default3]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default0]:[2022-10-07 09:01:41,103] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed Flops Profiler Enabled: False +[default0]:[2022-10-07 09:01:41,103] [INFO] [logging.py:68:log_dist] [Rank 0] Removing param_group that has no 'params' in the client Optimizer +[default0]:[2022-10-07 09:01:41,103] [INFO] [logging.py:68:log_dist] [Rank 0] Using client Optimizer as basic optimizer +[default0]:[2022-10-07 09:01:41,106] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed Basic Optimizer = {basic_optimizer.__class__.__name__} +[default0]:[2022-10-07 09:01:41,106] [INFO] [utils.py:52:is_zero_supported_optimizer] Checking ZeRO support for optimizer=FusedAdam type= +[default0]:[2022-10-07 09:01:41,106] [INFO] [logging.py:68:log_dist] [Rank 0] Creating fp16 ZeRO stage 1 optimizer +[default0]:[2022-10-07 09:01:41,106] [INFO] [stage_1_and_2.py:134:__init__] Reduce bucket size 500000000 +[default0]:[2022-10-07 09:01:41,106] [INFO] [stage_1_and_2.py:135:__init__] Allgather bucket size 500000000 +[default0]:[2022-10-07 09:01:41,106] [INFO] [stage_1_and_2.py:136:__init__] CPU Offload: False +[default0]:[2022-10-07 09:01:41,106] [INFO] [stage_1_and_2.py:137:__init__] Round robin gradient partitioning: False +[default0]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default4]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default2]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default7]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default5]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default6]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default4]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default2]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default3]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default6]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default5]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default1]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default7]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default0]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default1]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default6]:Emitting ninja build file /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113/utils/build.ninja... +[default6]:Building extension module utils... +[default6]:Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) +[default6]:ninja: no work to do. +[default0]:Loading extension module utils... +[default2]:Loading extension module utils... +[default1]:Loading extension module utils... +[default3]:Loading extension module utils... +[default4]:Loading extension module utils... +[default6]:Loading extension module utils... +[default7]:Loading extension module utils... +[default5]:Loading extension module utils... +[default4]:Loading extension module utils... +[default2]:Loading extension module utils... +[default4]:Loading extension module utils... +[default4]:Time to load utils op: 0.8019776344299316 seconds +[default2]:Time to load utils op: 0.8022556304931641 seconds +[default5]:Loading extension module utils... +[default5]:Time to load utils op: 0.7888736724853516 seconds +[default3]:Loading extension module utils... +[default3]:Time to load utils op: 0.7883350849151611 seconds +[default6]:Loading extension module utils... +[default6]:Time to load utils op: 0.7391223907470703 seconds +[default1]:Loading extension module utils... +[default1]:Time to load utils op: 0.7883419990539551 seconds +[default0]:Loading extension module utils... +[default0]:Time to load utils op: 0.8021972179412842 seconds +[default7]:Loading extension module utils... +[default7]:Time to load utils op: 0.7885668277740479 seconds +[default1]:Loading extension module utils... +[default1]:Time to load utils op: 0.7783148288726807 seconds +[default0]:Loading extension module utils... +[default0]:Time to load utils op: 0.7819583415985107 seconds +[default0]:Loading extension module utils... +[default0]:Time to load utils op: 0.8063898086547852 seconds +[default2]:Loading extension module utils... +[default2]:Time to load utils op: 0.806389570236206 seconds +[default3]:Loading extension module utils... +[default3]:Time to load utils op: 0.8063948154449463 seconds +[default4]:Loading extension module utils... +[default4]:Time to load utils op: 0.8063857555389404 seconds +[default1]:Loading extension module utils... +[default1]:Time to load utils op: 0.8063838481903076 seconds +[default1]:Loading extension module utils... +[default1]:Time to load utils op: 0.7621660232543945 seconds +[default5]:Loading extension module utils... +[default5]:Time to load utils op: 0.8060042858123779 seconds +[default6]:Loading extension module utils... +[default6]:Time to load utils op: 0.8063879013061523 seconds +[default7]:Loading extension module utils... +[default7]:Time to load utils op: 0.8063933849334717 seconds +[default0]:Loading extension module utils... +[default0]:Time to load utils op: 0.7926011085510254 seconds +[default3]:Loading extension module utils... +[default2]:Loading extension module utils... +[default2]:Time to load utils op: 0.792579174041748 seconds +[default3]:Time to load utils op: 0.7781679630279541 seconds +[default6]:Loading extension module utils... +[default6]:Time to load utils op: 0.7926058769226074 seconds +[default4]:Loading extension module utils... +[default4]:Time to load utils op: 0.792588472366333 seconds +[default5]:Loading extension module utils... +[default5]:Time to load utils op: 0.7782423496246338 seconds +[default7]:Loading extension module utils... +[default7]:Time to load utils op: 0.7787699699401855 seconds +[default0]:Time to load utils op: 0.7491698265075684 seconds +[default2]:Time to load utils op: 0.7491793632507324 seconds +[default3]:Time to load utils op: 0.7355475425720215 seconds +[default1]:Time to load utils op: 0.7359676361083984 seconds +[default4]:Time to load utils op: 0.7491638660430908 seconds +[default6]:Time to load utils op: 0.7491645812988281 seconds +[default7]:Time to load utils op: 0.7356264591217041 seconds +[default5]:Time to load utils op: 0.7356116771697998 seconds +[default2]:Loading extension module utils... +[default2]:Time to load utils op: 0.7812683582305908 seconds +[default4]:Time to load utils op: 0.7466123104095459 seconds +[default5]:Loading extension module utils... +[default5]:Time to load utils op: 0.75823974609375 seconds +[default3]:Loading extension module utils... +[default3]:Time to load utils op: 0.7574918270111084 seconds +[default6]:Loading extension module utils... +[default6]:Time to load utils op: 0.7784686088562012 seconds +[default3]:Loading extension module utils... +[default3]:Time to load utils op: 0.7353103160858154 seconds +[default7]:Loading extension module utils... +[default7]:Time to load utils op: 0.7593319416046143 seconds +[default1]:Loading extension module utils... +[default1]:Time to load utils op: 0.7339518070220947 seconds +[default7]:Loading extension module utils... +[default0]:Loading extension module utils... +[default0]:Time to load utils op: 0.7391815185546875 seconds +[default7]:Time to load utils op: 0.7357239723205566 seconds +[default2]:Loading extension module utils... +[default2]:Time to load utils op: 0.7399265766143799 seconds +[default6]:Loading extension module utils... +[default6]:Time to load utils op: 0.7393078804016113 seconds +[default4]:Loading extension module utils... +[default4]:Time to load utils op: 0.7394003868103027 seconds +[default5]:Loading extension module utils... +[default5]:Time to load utils op: 0.7352516651153564 seconds +[default0]:Loading extension module utils... +[default0]:Time to load utils op: 0.7739708423614502 seconds +[default4]:Loading extension module utils... +[default4]:Time to load utils op: 0.773963451385498 seconds +[default2]:Loading extension module utils... +[default2]:Time to load utils op: 0.7739717960357666 seconds +[default6]:Loading extension module utils... +[default6]:Time to load utils op: 0.7739577293395996 seconds +[default5]:Loading extension module utils... +[default1]:Loading extension module utils... +[default1]:Time to load utils op: 0.77396559715271 seconds +[default7]:Loading extension module utils... +[default7]:Time to load utils op: 0.7739593982696533 seconds +[default5]:Time to load utils op: 0.7739646434783936 seconds +[default3]:Loading extension module utils... +[default3]:Time to load utils op: 0.7740089893341064 seconds +[default1]:Loading extension module utils... +[default1]:Time to load utils op: 0.783876895904541 seconds +[default3]:Loading extension module utils... +[default3]:Time to load utils op: 0.7769427299499512 seconds +[default0]:Loading extension module utils... +[default0]:Time to load utils op: 0.7859728336334229 seconds +[default4]:Loading extension module utils... +[default4]:Time to load utils op: 0.7845895290374756 seconds +[default2]:Loading extension module utils... +[default2]:Time to load utils op: 0.7848038673400879 seconds +[default7]:Loading extension module utils... +[default7]:Time to load utils op: 0.7844955921173096 seconds +[default5]:Loading extension module utils... +[default5]:Time to load utils op: 0.7839844226837158 seconds +[default6]:Loading extension module utils... +[default6]:Time to load utils op: 0.7846987247467041 seconds +[default6]:Rank: 38 partition count [16, 16, 16] and sizes[(24969216, False), (9961472, False), (14848, False)] +[default3]:Rank: 27 partition count [16, 16, 16] and sizes[(24969216, False), (9961472, False), (14848, False)] +[default2]:Rank: 26 partition count [16, 16, 16] and sizes[(24969216, False), (9961472, False), (14848, False)] +[default6]:Rank: 30 partition count [16, 16, 16] and sizes[(24969216, False), (9961472, False), (14848, False)] +[default7]:Rank: 31 partition count [16, 16, 16] and sizes[(24969216, False), (9961472, False), (14848, False)] +[default7]:Rank: 39 partition count [16, 16, 16] and sizes[(24969216, False), (9961472, False), (14848, False)] +[default0]:Rank: 16 partition count [16, 16, 16] and sizes[(24969216, False), (9961472, False), (14848, False)] +[default2]:Rank: 18 partition count [16, 16, 16] and sizes[(24969216, False), (9961472, False), (14848, False)] +[default4]:Rank: 4 partition count [16, 16, 16] and sizes[(24969216, False), (9961472, False), (14848, False)] +[default5]:Rank: 5 partition count [16, 16, 16] and sizes[(24969216, False), (9961472, False), (14848, False)] +[default2]:Rank: 10 partition count [16, 16, 16] and sizes[(24969216, False), (9961472, False), (14848, False)] +[default2]:Rank: 42 partition count [16, 16, 16] and sizes[(24969216, False), (9961472, False), (14848, False)] +[default6]:Rank: 62 partition count [16, 16, 16] and sizes[(24969216, False), (9961472, False), (14848, False)] +[default7]:Rank: 63 partition count [16, 16, 16] and sizes[(24969216, False), (9961472, False), (14848, False)] +[default1]:Rank: 17 partition count [16, 16, 16] and sizes[(24969216, False), (9961472, False), (14848, False)] +[default6]:Rank: 22 partition count [16, 16, 16] and sizes[(24969216, False), (9961472, False), (14848, False)] +[default7]:Rank: 23 partition count [16, 16, 16] and sizes[(24969216, False), (9961472, False), (14848, False)] +[default3]:Rank: 19 partition count [16, 16, 16] and sizes[(24969216, False), (9961472, False), (14848, False)] +[default0]:Rank: 48 partition count [16, 16, 16] and sizes[(24969216, False), (9961472, False), (14848, False)] +[default3]:Rank: 11 partition count [16, 16, 16] and sizes[(24969216, False), (9961472, False), (14848, False)] +[default6]:Rank: 14 partition count [16, 16, 16] and sizes[(24969216, False), (9961472, False), (14848, False)] +[default7]:Rank: 15 partition count [16, 16, 16] and sizes[(24969216, False), (9961472, False), (14848, False)] +[default3]:Rank: 43 partition count [16, 16, 16] and sizes[(24969216, False), (9961472, False), (14848, False)] +[default6]:Rank: 46 partition count [16, 16, 16] and sizes[(24969216, False), (9961472, False), (14848, False)] +[default7]:Rank: 47 partition count [16, 16, 16] and sizes[(24969216, False), (9961472, False), (14848, False)] +[default2]:Rank: 58 partition count [16, 16, 16] and sizes[(24969216, False), (9961472, False), (14848, False)] +[default6]:Rank: 54 partition count [16, 16, 16] and sizes[(24969216, False), (9961472, False), (14848, False)] +[default3]:Rank: 51 partition count [16, 16, 16] and sizes[(24969216, False), (9961472, False), (14848, False)] +[default3]:Rank: 59 partition count [16, 16, 16] and sizes[(24969216, False), (9961472, False), (14848, False)] +[default1]:Rank: 49 partition count [16, 16, 16] and sizes[(24969216, False), (9961472, False), (14848, False)] +[default1]:Rank: 41 partition count [16, 16, 16] and sizes[(24969216, False), (9961472, False), (14848, False)] +[default0]:Rank: 56 partition count [16, 16, 16] and sizes[(24969216, False), (9961472, False), (14848, False)] +[default1]:Rank: 57 partition count [16, 16, 16] and sizes[(24969216, False), (9961472, False), (14848, False)] +[default7]:Rank: 7 partition count [16, 16, 16] and sizes[(24969216, False), (9961472, False), (14848, False)] +[default6]:Rank: 6 partition count [16, 16, 16] and sizes[(24969216, False), (9961472, False), (14848, False)] +[default2]:Rank: 50 partition count [16, 16, 16] and sizes[(24969216, False), (9961472, False), (14848, False)] +[default7]:Rank: 55 partition count [16, 16, 16] and sizes[(24969216, False), (9961472, False), (14848, False)] +[default0]:Rank: 40 partition count [16, 16, 16] and sizes[(24969216, False), (9961472, False), (14848, False)] +[default4]:Rank: 52 partition count [16, 16, 16] and sizes[(24969216, False), (9961472, False), (14848, False)] +[default5]:Rank: 53 partition count [16, 16, 16] and sizes[(24969216, False), (9961472, False), (14848, False)] +[default0]:Rank: 32 partition count [16, 16, 16] and sizes[(24969216, False), (9961472, False), (14848, False)] +[default1]:Rank: 33 partition count [16, 16, 16] and sizes[(24969216, False), (9961472, False), (14848, False)] +[default4]:Rank: 28 partition count [16, 16, 16] and sizes[(24969216, False), (9961472, False), (14848, False)] +[default5]:Rank: 29 partition count [16, 16, 16] and sizes[(24969216, False), (9961472, False), (14848, False)] +[default1]:Rank: 25 partition count [16, 16, 16] and sizes[(24969216, False), (9961472, False), (14848, False)] +[default0]:Rank: 24 partition count [16, 16, 16] and sizes[(24969216, False), (9961472, False), (14848, False)] +[default4]:Rank: 36 partition count [16, 16, 16] and sizes[(24969216, False), (9961472, False), (14848, False)] +[default4]:Rank: 20 partition count [16, 16, 16] and sizes[(24969216, False), (9961472, False), (14848, False)] +[default5]:Rank: 21 partition count [16, 16, 16] and sizes[(24969216, False), (9961472, False), (14848, False)] +[default4]:Rank: 12 partition count [16, 16, 16] and sizes[(24969216, False), (9961472, False), (14848, False)] +[default5]:Rank: 13 partition count [16, 16, 16] and sizes[(24969216, False), (9961472, False), (14848, False)] +[default2]:Rank: 34 partition count [16, 16, 16] and sizes[(24969216, False), (9961472, False), (14848, False)] +[default3]:Rank: 35 partition count [16, 16, 16] and sizes[(24969216, False), (9961472, False), (14848, False)] +[default5]:Rank: 37 partition count [16, 16, 16] and sizes[(24969216, False), (9961472, False), (14848, False)] +[default5]:Rank: 61 partition count [16, 16, 16] and sizes[(24969216, False), (9961472, False), (14848, False)] +[default3]:Rank: 3 partition count [16, 16, 16] and sizes[(24969216, False), (9961472, False), (14848, False)] +[default2]:Rank: 2 partition count [16, 16, 16] and sizes[(24969216, False), (9961472, False), (14848, False)] +[default4]:Rank: 60 partition count [16, 16, 16] and sizes[(24969216, False), (9961472, False), (14848, False)] +[default4]:Rank: 44 partition count [16, 16, 16] and sizes[(24969216, False), (9961472, False), (14848, False)] +[default5]:Rank: 45 partition count [16, 16, 16] and sizes[(24969216, False), (9961472, False), (14848, False)] +[default1]:Rank: 1 partition count [16, 16, 16] and sizes[(24969216, False), (9961472, False), (14848, False)] +[default0]:Rank: 0 partition count [16, 16, 16] and sizes[(24969216, False), (9961472, False), (14848, False)] +[default4]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default4]:No modifications detected for re-loaded extension module utils, skipping build step... +[default4]:Loading extension module utils... +[default4]:Time to load utils op: 0.0005953311920166016 seconds +[default2]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default2]:No modifications detected for re-loaded extension module utils, skipping build step... +[default2]:Loading extension module utils... +[default2]:Time to load utils op: 0.0005824565887451172 seconds +[default1]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default1]:No modifications detected for re-loaded extension module utils, skipping build step... +[default1]:Loading extension module utils... +[default1]:Time to load utils op: 0.0005369186401367188 seconds +[default0]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default0]:No modifications detected for re-loaded extension module utils, skipping build step... +[default0]:Loading extension module utils... +[default0]:Time to load utils op: 0.0005612373352050781 seconds +[default3]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default3]:No modifications detected for re-loaded extension module utils, skipping build step... +[default3]:Loading extension module utils... +[default3]:Time to load utils op: 0.0005290508270263672 seconds +[default7]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default7]:No modifications detected for re-loaded extension module utils, skipping build step... +[default7]:Loading extension module utils... +[default5]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default5]:No modifications detected for re-loaded extension module utils, skipping build step... +[default5]:Loading extension module utils... +[default5]:Time to load utils op: 0.0004363059997558594 seconds +[default7]:Time to load utils op: 0.0004107952117919922 seconds +[default6]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default6]:No modifications detected for re-loaded extension module utils, skipping build step... +[default6]:Loading extension module utils... +[default6]:Time to load utils op: 0.0005574226379394531 seconds +[default1]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default1]:No modifications detected for re-loaded extension module utils, skipping build step... +[default1]:Loading extension module utils... +[default1]:Time to load utils op: 0.0010471343994140625 seconds +[default0]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default0]:No modifications detected for re-loaded extension module utils, skipping build step... +[default0]:Loading extension module utils... +[default0]:Time to load utils op: 0.0006661415100097656 seconds +[default3]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default3]:No modifications detected for re-loaded extension module utils, skipping build step... +[default3]:Loading extension module utils... +[default3]:Time to load utils op: 0.0005824565887451172 seconds +[default0]:Rank: 8 partition count [16, 16, 16] and sizes[(24969216, False), (9961472, False), (14848, False)] +[default0]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default0]:No modifications detected for re-loaded extension module utils, skipping build step... +[default2]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default2]:No modifications detected for re-loaded extension module utils, skipping build step... +[default2]:Loading extension module utils... +[default2]:Time to load utils op: 0.0005831718444824219 seconds +[default0]:Loading extension module utils... +[default0]:Time to load utils op: 0.0005815029144287109 seconds +[default4]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default4]:No modifications detected for re-loaded extension module utils, skipping build step... +[default4]:Loading extension module utils... +[default1]:Rank: 9 partition count [16, 16, 16] and sizes[(24969216, False), (9961472, False), (14848, False)] +[default1]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default1]:No modifications detected for re-loaded extension module utils, skipping build step... +[default4]:Time to load utils op: 0.0012385845184326172 seconds +[default1]:Loading extension module utils... +[default1]:Time to load utils op: 0.0005862712860107422 seconds +[default1]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default1]:No modifications detected for re-loaded extension module utils, skipping build step... +[default1]:Loading extension module utils... +[default1]:Time to load utils op: 0.0006024837493896484 seconds +[default6]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default6]:No modifications detected for re-loaded extension module utils, skipping build step... +[default6]:Loading extension module utils... +[default6]:Time to load utils op: 0.00047135353088378906 seconds +[default5]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default5]:No modifications detected for re-loaded extension module utils, skipping build step... +[default5]:Loading extension module utils... +[default5]:Time to load utils op: 0.0009822845458984375 seconds +[default7]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default7]:No modifications detected for re-loaded extension module utils, skipping build step... +[default7]:Loading extension module utils... +[default7]:Time to load utils op: 0.0005471706390380859 seconds +[default0]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default0]:No modifications detected for re-loaded extension module utils, skipping build step... +[default0]:Loading extension module utils... +[default0]:Time to load utils op: 0.0005414485931396484 seconds +[default2]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default2]:No modifications detected for re-loaded extension module utils, skipping build step... +[default2]:Loading extension module utils... +[default2]:Time to load utils op: 0.000553131103515625 seconds +[default3]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default3]:No modifications detected for re-loaded extension module utils, skipping build step... +[default3]:Loading extension module utils... +[default3]:Time to load utils op: 0.0005600452423095703 seconds +[default6]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default6]:No modifications detected for re-loaded extension module utils, skipping build step... +[default6]:Loading extension module utils... +[default6]:Time to load utils op: 0.0005464553833007812 seconds +[default4]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default4]:No modifications detected for re-loaded extension module utils, skipping build step... +[default4]:Loading extension module utils... +[default4]:Time to load utils op: 0.0011987686157226562 seconds +[default5]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default7]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default7]:No modifications detected for re-loaded extension module utils, skipping build step... +[default5]:No modifications detected for re-loaded extension module utils, skipping build step... +[default5]:Loading extension module utils... +[default7]:Loading extension module utils... +[default7]:Time to load utils op: 0.0005543231964111328 seconds +[default5]:Time to load utils op: 0.002550840377807617 seconds +[default2]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default3]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default0]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default0]:No modifications detected for re-loaded extension module utils, skipping build step... +[default0]:Loading extension module utils... +[default0]:Time to load utils op: 0.0014226436614990234 seconds +[default2]:No modifications detected for re-loaded extension module utils, skipping build step... +[default2]:Loading extension module utils... +[default2]:Time to load utils op: 0.000988006591796875 seconds +[default4]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default4]:No modifications detected for re-loaded extension module utils, skipping build step... +[default3]:No modifications detected for re-loaded extension module utils, skipping build step... +[default3]:Loading extension module utils... +[default3]:Time to load utils op: 0.0012867450714111328 seconds +[default4]:Loading extension module utils... +[default4]:Time to load utils op: 0.0022699832916259766 seconds +[default1]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default1]:No modifications detected for re-loaded extension module utils, skipping build step... +[default1]:Loading extension module utils... +[default1]:Time to load utils op: 0.0009765625 seconds +[default6]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default6]:No modifications detected for re-loaded extension module utils, skipping build step... +[default6]:Loading extension module utils... +[default6]:Time to load utils op: 0.0006265640258789062 seconds +[default5]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default7]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default7]:No modifications detected for re-loaded extension module utils, skipping build step... +[default7]:Loading extension module utils... +[default7]:Time to load utils op: 0.0010759830474853516 seconds +[default5]:No modifications detected for re-loaded extension module utils, skipping build step... +[default5]:Loading extension module utils... +[default5]:Time to load utils op: 0.002321958541870117 seconds +[default2]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default5]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default5]:No modifications detected for re-loaded extension module utils, skipping build step... +[default5]:Loading extension module utils... +[default5]:Time to load utils op: 0.0013165473937988281 seconds +[default2]:No modifications detected for re-loaded extension module utils, skipping build step... +[default2]:Loading extension module utils... +[default2]:Time to load utils op: 0.0008742809295654297 seconds +[default3]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default3]:No modifications detected for re-loaded extension module utils, skipping build step... +[default3]:Loading extension module utils... +[default3]:Time to load utils op: 0.0006060600280761719 seconds +[default4]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default4]:No modifications detected for re-loaded extension module utils, skipping build step... +[default4]:Loading extension module utils... +[default4]:Time to load utils op: 0.0026781558990478516 seconds +[default6]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default6]:No modifications detected for re-loaded extension module utils, skipping build step... +[default6]:Loading extension module utils... +[default6]:Time to load utils op: 0.0011484622955322266 seconds +[default3]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default3]:No modifications detected for re-loaded extension module utils, skipping build step... +[default3]:Loading extension module utils... +[default3]:Time to load utils op: 0.0006732940673828125 seconds +[default7]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default6]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default6]:No modifications detected for re-loaded extension module utils, skipping build step... +[default7]:No modifications detected for re-loaded extension module utils, skipping build step... +[default7]:Loading extension module utils... +[default6]:Loading extension module utils... +[default6]:Time to load utils op: 0.0006077289581298828 seconds +[default7]:Time to load utils op: 0.001756429672241211 seconds +[default7]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default7]:No modifications detected for re-loaded extension module utils, skipping build step... +[default7]:Loading extension module utils... +[default7]:Time to load utils op: 0.0005893707275390625 seconds +[default1]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default1]:No modifications detected for re-loaded extension module utils, skipping build step... +[default1]:Loading extension module utils... +[default1]:Time to load utils op: 0.0005981922149658203 seconds +[default2]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default4]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default4]:No modifications detected for re-loaded extension module utils, skipping build step... +[default4]:Loading extension module utils... +[default4]:Time to load utils op: 0.0021860599517822266 seconds +[default2]:No modifications detected for re-loaded extension module utils, skipping build step... +[default2]:Loading extension module utils... +[default2]:Time to load utils op: 0.0006403923034667969 seconds +[default0]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default0]:No modifications detected for re-loaded extension module utils, skipping build step... +[default0]:Loading extension module utils... +[default0]:Time to load utils op: 0.0007674694061279297 seconds +[default5]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default5]:No modifications detected for re-loaded extension module utils, skipping build step... +[default5]:Loading extension module utils... +[default5]:Time to load utils op: 0.0011212825775146484 seconds +[default6]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default6]:No modifications detected for re-loaded extension module utils, skipping build step... +[default6]:Loading extension module utils... +[default6]:Time to load utils op: 0.0005156993865966797 seconds +[default4]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default4]:No modifications detected for re-loaded extension module utils, skipping build step... +[default2]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default2]:No modifications detected for re-loaded extension module utils, skipping build step... +[default2]:Loading extension module utils... +[default2]:Time to load utils op: 0.0005171298980712891 seconds +[default0]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default0]:No modifications detected for re-loaded extension module utils, skipping build step... +[default0]:Loading extension module utils... +[default0]:Time to load utils op: 0.0008482933044433594 seconds +[default4]:Loading extension module utils... +[default4]:Time to load utils op: 0.0008857250213623047 seconds +[default1]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default1]:No modifications detected for re-loaded extension module utils, skipping build step... +[default1]:Loading extension module utils... +[default1]:Time to load utils op: 0.0005118846893310547 seconds +[default3]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default7]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default7]:No modifications detected for re-loaded extension module utils, skipping build step... +[default3]:No modifications detected for re-loaded extension module utils, skipping build step... +[default3]:Loading extension module utils... +[default3]:Time to load utils op: 0.0008385181427001953 seconds +[default7]:Loading extension module utils... +[default7]:Time to load utils op: 0.0005450248718261719 seconds +[default5]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default5]:No modifications detected for re-loaded extension module utils, skipping build step... +[default5]:Loading extension module utils... +[default5]:Time to load utils op: 0.0012278556823730469 seconds +[default1]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default1]:No modifications detected for re-loaded extension module utils, skipping build step... +[default1]:Loading extension module utils... +[default1]:Time to load utils op: 0.001299142837524414 seconds +[default0]:[2022-10-07 09:01:46,413] [INFO] [utils.py:827:see_memory_usage] Before initializing optimizer states +[default0]:[2022-10-07 09:01:46,414] [INFO] [utils.py:828:see_memory_usage] MA 1.17 GB Max_MA 1.19 GB CA 1.79 GB Max_CA 2 GB +[default0]:[2022-10-07 09:01:46,414] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 36.98 GB, percent = 7.3% +[default3]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default3]:No modifications detected for re-loaded extension module utils, skipping build step... +[default3]:Loading extension module utils... +[default3]:Time to load utils op: 0.0017936229705810547 seconds +[default4]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default4]:No modifications detected for re-loaded extension module utils, skipping build step... +[default4]:Loading extension module utils... +[default4]:Time to load utils op: 0.002022981643676758 seconds +[default2]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default2]:No modifications detected for re-loaded extension module utils, skipping build step... +[default2]:Loading extension module utils... +[default2]:Time to load utils op: 0.0015180110931396484 seconds +[default5]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default7]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default7]:No modifications detected for re-loaded extension module utils, skipping build step... +[default7]:Loading extension module utils... +[default5]:No modifications detected for re-loaded extension module utils, skipping build step... +[default5]:Loading extension module utils... +[default5]:Time to load utils op: 0.0015060901641845703 seconds +[default7]:Time to load utils op: 0.0023856163024902344 seconds +[default6]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default6]:No modifications detected for re-loaded extension module utils, skipping build step... +[default6]:Loading extension module utils... +[default6]:Time to load utils op: 0.0014371871948242188 seconds +[default0]:[2022-10-07 09:01:46,487] [INFO] [utils.py:827:see_memory_usage] After initializing optimizer states +[default0]:[2022-10-07 09:01:46,487] [INFO] [utils.py:828:see_memory_usage] MA 1.43 GB Max_MA 1.56 GB CA 2.14 GB Max_CA 2 GB +[default0]:[2022-10-07 09:01:46,487] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 36.98 GB, percent = 7.3% +[default0]:[2022-10-07 09:01:46,488] [INFO] [stage_1_and_2.py:516:__init__] optimizer state initialized +[default0]:[2022-10-07 09:01:46,515] [INFO] [utils.py:827:see_memory_usage] After initializing ZeRO optimizer +[default0]:[2022-10-07 09:01:46,515] [INFO] [utils.py:828:see_memory_usage] MA 1.43 GB Max_MA 1.43 GB CA 2.14 GB Max_CA 2 GB +[default0]:[2022-10-07 09:01:46,515] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 36.98 GB, percent = 7.3% +[default0]:[2022-10-07 09:01:46,515] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed Final Optimizer = FusedAdam +[default0]:[2022-10-07 09:01:46,516] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed using client LR scheduler +[default0]:[2022-10-07 09:01:46,516] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed LR Scheduler = +[default0]:[2022-10-07 09:01:46,516] [INFO] [logging.py:68:log_dist] [Rank 0] step=0, skipped=0, lr=[2e-05, 2e-05, 2e-05], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)] +[default0]:[2022-10-07 09:01:46,516] [INFO] [config.py:987:print] DeepSpeedEngine configuration: +[default0]:[2022-10-07 09:01:46,516] [INFO] [config.py:991:print] activation_checkpointing_config { +[default0]: "partition_activations": false, +[default0]: "contiguous_memory_optimization": false, +[default0]: "cpu_checkpointing": false, +[default0]: "number_checkpoints": null, +[default0]: "synchronize_checkpoint_boundary": false, +[default0]: "profile": false +[default0]:} +[default0]:[2022-10-07 09:01:46,516] [INFO] [config.py:991:print] aio_config ................... {'block_size': 1048576, 'queue_depth': 8, 'thread_count': 1, 'single_submit': False, 'overlap_events': True} +[default0]:[2022-10-07 09:01:46,516] [INFO] [config.py:991:print] amp_enabled .................. False +[default0]:[2022-10-07 09:01:46,516] [INFO] [config.py:991:print] amp_params ................... False +[default0]:[2022-10-07 09:01:46,516] [INFO] [config.py:991:print] autotuning_config ............ { +[default0]: "enabled": false, +[default0]: "start_step": null, +[default0]: "end_step": null, +[default0]: "metric_path": null, +[default0]: "arg_mappings": null, +[default0]: "metric": "throughput", +[default0]: "model_info": null, +[default0]: "results_dir": null, +[default0]: "exps_dir": null, +[default0]: "overwrite": true, +[default0]: "fast": true, +[default0]: "start_profile_step": 3, +[default0]: "end_profile_step": 5, +[default0]: "tuner_type": "gridsearch", +[default0]: "tuner_early_stopping": 5, +[default0]: "tuner_num_trials": 50, +[default0]: "model_info_path": null, +[default0]: "mp_size": 1, +[default0]: "max_train_batch_size": null, +[default0]: "min_train_batch_size": 1, +[default0]: "max_train_micro_batch_size_per_gpu": 1.024000e+03, +[default0]: "min_train_micro_batch_size_per_gpu": 1, +[default0]: "num_tuning_micro_batch_sizes": 3 +[default0]:} +[default0]:[2022-10-07 09:01:46,516] [INFO] [config.py:991:print] bfloat16_enabled ............. False +[default0]:[2022-10-07 09:01:46,516] [INFO] [config.py:991:print] checkpoint_tag_validation_enabled True +[default0]:[2022-10-07 09:01:46,516] [INFO] [config.py:991:print] checkpoint_tag_validation_fail False +[default0]:[2022-10-07 09:01:46,516] [INFO] [config.py:991:print] comms_config ................. +[default0]:[2022-10-07 09:01:46,516] [INFO] [config.py:991:print] communication_data_type ...... None +[default0]:[2022-10-07 09:01:46,516] [INFO] [config.py:991:print] compression_config ........... {'weight_quantization': {'shared_parameters': {'enabled': False, 'quantizer_kernel': False, 'schedule_offset': 0, 'quantize_groups': 1, 'quantize_verbose': False, 'quantization_type': 'symmetric', 'quantize_weight_in_forward': False, 'rounding': 'nearest', 'fp16_mixed_quantize': False, 'quantize_change_ratio': 0.001}, 'different_groups': {}}, 'activation_quantization': {'shared_parameters': {'enabled': False, 'quantization_type': 'symmetric', 'range_calibration': 'dynamic', 'schedule_offset': 1000}, 'different_groups': {}}, 'sparse_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'row_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'head_pruning': {'shared_parameters': {'enabled': False, 'method': 'topk', 'schedule_offset': 1000}, 'different_groups': {}}, 'channel_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'layer_reduction': {'enabled': False}} +[default0]:[2022-10-07 09:01:46,517] [INFO] [config.py:991:print] curriculum_enabled ........... False +[default0]:[2022-10-07 09:01:46,517] [INFO] [config.py:991:print] curriculum_params ............ False +[default0]:[2022-10-07 09:01:46,517] [INFO] [config.py:991:print] dataloader_drop_last ......... False +[default0]:[2022-10-07 09:01:46,517] [INFO] [config.py:991:print] disable_allgather ............ False +[default0]:[2022-10-07 09:01:46,517] [INFO] [config.py:991:print] dump_state ................... False +[default0]:[2022-10-07 09:01:46,517] [INFO] [config.py:991:print] dynamic_loss_scale_args ...... {'init_scale': 4096, 'scale_window': 500, 'delayed_shift': 2, 'min_scale': 1} +[default0]:[2022-10-07 09:01:46,517] [INFO] [config.py:991:print] eigenvalue_enabled ........... False +[default0]:[2022-10-07 09:01:46,517] [INFO] [config.py:991:print] eigenvalue_gas_boundary_resolution 1 +[default0]:[2022-10-07 09:01:46,517] [INFO] [config.py:991:print] eigenvalue_layer_name ........ bert.encoder.layer +[default0]:[2022-10-07 09:01:46,517] [INFO] [config.py:991:print] eigenvalue_layer_num ......... 0 +[default0]:[2022-10-07 09:01:46,517] [INFO] [config.py:991:print] eigenvalue_max_iter .......... 100 +[default0]:[2022-10-07 09:01:46,517] [INFO] [config.py:991:print] eigenvalue_stability ......... 1e-06 +[default0]:[2022-10-07 09:01:46,517] [INFO] [config.py:991:print] eigenvalue_tol ............... 0.01 +[default0]:[2022-10-07 09:01:46,517] [INFO] [config.py:991:print] eigenvalue_verbose ........... False +[default0]:[2022-10-07 09:01:46,517] [INFO] [config.py:991:print] elasticity_enabled ........... False +[default0]:[2022-10-07 09:01:46,517] [INFO] [config.py:991:print] flops_profiler_config ........ { +[default0]: "enabled": false, +[default0]: "profile_step": 1, +[default0]: "module_depth": -1, +[default0]: "top_modules": 1, +[default0]: "detailed": true, +[default0]: "output_file": null +[default0]:} +[default0]:[2022-10-07 09:01:46,517] [INFO] [config.py:991:print] fp16_auto_cast ............... False +[default0]:[2022-10-07 09:01:46,517] [INFO] [config.py:991:print] fp16_enabled ................. True +[default0]:[2022-10-07 09:01:46,517] [INFO] [config.py:991:print] fp16_master_weights_and_gradients False +[default0]:[2022-10-07 09:01:46,517] [INFO] [config.py:991:print] global_rank .................. 0 +[default0]:[2022-10-07 09:01:46,517] [INFO] [config.py:991:print] gradient_accumulation_steps .. 128 +[default0]:[2022-10-07 09:01:46,517] [INFO] [config.py:991:print] gradient_clipping ............ 1.0 +[default0]:[2022-10-07 09:01:46,517] [INFO] [config.py:991:print] gradient_predivide_factor .... 1.0 +[default0]:[2022-10-07 09:01:46,517] [INFO] [config.py:991:print] initial_dynamic_scale ........ 4096 +[default0]:[2022-10-07 09:01:46,517] [INFO] [config.py:991:print] load_universal_checkpoint .... False +[default0]:[2022-10-07 09:01:46,517] [INFO] [config.py:991:print] loss_scale ................... 0 +[default0]:[2022-10-07 09:01:46,517] [INFO] [config.py:991:print] memory_breakdown ............. False +[default0]:[2022-10-07 09:01:46,517] [INFO] [config.py:991:print] monitor_config ............... +[default0]:[2022-10-07 09:01:46,517] [INFO] [config.py:991:print] nebula_config ................ { +[default0]: "enabled": false, +[default0]: "persistent_storage_path": null, +[default0]: "persistent_time_interval": 100, +[default0]: "num_of_version_in_retention": 2, +[default0]: "enable_nebula_load": true, +[default0]: "load_path": null +[default0]:} +[default0]:[2022-10-07 09:01:46,517] [INFO] [config.py:991:print] optimizer_legacy_fusion ...... False +[default0]:[2022-10-07 09:01:46,517] [INFO] [config.py:991:print] optimizer_name ............... None +[default0]:[2022-10-07 09:01:46,517] [INFO] [config.py:991:print] optimizer_params ............. None +[default0]:[2022-10-07 09:01:46,517] [INFO] [config.py:991:print] pipeline ..................... {'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0} +[default0]:[2022-10-07 09:01:46,517] [INFO] [config.py:991:print] pld_enabled .................. False +[default0]:[2022-10-07 09:01:46,517] [INFO] [config.py:991:print] pld_params ................... False +[default0]:[2022-10-07 09:01:46,517] [INFO] [config.py:991:print] prescale_gradients ........... False +[default0]:[2022-10-07 09:01:46,517] [INFO] [config.py:991:print] scheduler_name ............... None +[default0]:[2022-10-07 09:01:46,517] [INFO] [config.py:991:print] scheduler_params ............. None +[default0]:[2022-10-07 09:01:46,517] [INFO] [config.py:991:print] sparse_attention ............. None +[default0]:[2022-10-07 09:01:46,517] [INFO] [config.py:991:print] sparse_gradients_enabled ..... False +[default0]:[2022-10-07 09:01:46,517] [INFO] [config.py:991:print] steps_per_print .............. 2000 +[default0]:[2022-10-07 09:01:46,517] [INFO] [config.py:991:print] train_batch_size ............. 2048 +[default0]:[2022-10-07 09:01:46,517] [INFO] [config.py:991:print] train_micro_batch_size_per_gpu 1 +[default0]:[2022-10-07 09:01:46,517] [INFO] [config.py:991:print] wall_clock_breakdown ......... False +[default0]:[2022-10-07 09:01:46,517] [INFO] [config.py:991:print] world_size ................... 16 +[default0]:[2022-10-07 09:01:46,517] [INFO] [config.py:991:print] zero_allow_untested_optimizer False +[default0]:[2022-10-07 09:01:46,517] [INFO] [config.py:991:print] zero_config .................. stage=1 contiguous_gradients=True reduce_scatter=True reduce_bucket_size=500000000 allgather_partitions=True allgather_bucket_size=500000000 overlap_comm=False load_from_fp32_weights=True elastic_checkpoint=False offload_param=None offload_optimizer=None sub_group_size=1000000000 cpu_offload_param=None cpu_offload_use_pin_memory=None cpu_offload=None prefetch_bucket_size=50000000 param_persistence_threshold=100000 model_persistence_threshold=9223372036854775807 max_live_parameters=1000000000 max_reuse_distance=1000000000 gather_16bit_weights_on_model_save=False stage3_gather_fp16_weights_on_model_save=False ignore_unused_parameters=True legacy_stage1=False round_robin_gradients=False +[default0]:[2022-10-07 09:01:46,517] [INFO] [config.py:991:print] zero_enabled ................. True +[default0]:[2022-10-07 09:01:46,517] [INFO] [config.py:991:print] zero_optimization_stage ...... 1 +[default0]:[2022-10-07 09:01:46,517] [INFO] [config.py:976:print_user_config] json = { +[default0]: "train_micro_batch_size_per_gpu": 1, +[default0]: "train_batch_size": 2.048000e+03, +[default0]: "gradient_clipping": 1.0, +[default0]: "zero_optimization": { +[default0]: "stage": 1 +[default0]: }, +[default0]: "fp16": { +[default0]: "enabled": true, +[default0]: "loss_scale": 0, +[default0]: "loss_scale_window": 500, +[default0]: "hysteresis": 2, +[default0]: "min_loss_scale": 1, +[default0]: "initial_scale_power": 12 +[default0]: }, +[default0]: "steps_per_print": 2.000000e+03, +[default0]: "wall_clock_breakdown": false +[default0]:} +[default0]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default0]:No modifications detected for re-loaded extension module utils, skipping build step... +[default0]:Loading extension module utils... +[default0]:Time to load utils op: 0.0004749298095703125 seconds +[default0]:[2022-10-07 09:01:46,518] [INFO] [engine.py:87:__init__] CONFIG: micro_batches=128 micro_batch_size=1 +[default0]:[2022-10-07 09:01:46,961] [INFO] [engine.py:145:__init__] RANK=32 STAGE=1 LAYERS=16 [15, 31) STAGE_PARAMS=559128576 (559.129M) TOTAL_PARAMS=2236514304 (2236.514M) UNIQUE_PARAMS=1722712064 (1722.712M) +[default1]:[2022-10-07 09:01:46,961] [INFO] [engine.py:145:__init__] RANK=33 STAGE=1 LAYERS=16 [15, 31) STAGE_PARAMS=559128576 (559.129M) TOTAL_PARAMS=2236514304 (2236.514M) UNIQUE_PARAMS=1722712064 (1722.712M) +[default1]:[2022-10-07 09:01:46,961] [INFO] [engine.py:145:__init__] RANK=1 STAGE=0 LAYERS=15 [0, 15) STAGE_PARAMS=559128576 (559.129M) TOTAL_PARAMS=2236514304 (2236.514M) UNIQUE_PARAMS=1722712064 (1722.712M) +[default0]:[2022-10-07 09:01:46,961] [INFO] [engine.py:145:__init__] RANK=0 STAGE=0 LAYERS=15 [0, 15) STAGE_PARAMS=559128576 (559.129M) TOTAL_PARAMS=2236514304 (2236.514M) UNIQUE_PARAMS=1722712064 (1722.712M) +[default4]:[2022-10-07 09:01:47,190] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default2]:[2022-10-07 09:01:47,190] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default0]:[2022-10-07 09:01:47,190] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default1]:[2022-10-07 09:01:47,190] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default3]:[2022-10-07 09:01:47,190] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default7]:[2022-10-07 09:01:47,190] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default6]:[2022-10-07 09:01:47,190] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default5]:[2022-10-07 09:01:47,190] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default1]:[2022-10-07 09:01:47,183] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default0]:[2022-10-07 09:01:47,180] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default3]:[2022-10-07 09:01:47,187] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default2]:[2022-10-07 09:01:47,187] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default1]:[2022-10-07 09:01:47,187] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default0]:[2022-10-07 09:01:47,187] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default4]:[2022-10-07 09:01:47,187] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default1]:[2022-10-07 09:01:47,180] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default6]:[2022-10-07 09:01:47,187] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default7]:[2022-10-07 09:01:47,187] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default5]:[2022-10-07 09:01:47,187] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default0]:[2022-10-07 09:01:47,183] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default2]:[2022-10-07 09:01:47,183] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default3]:[2022-10-07 09:01:47,183] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default6]:[2022-10-07 09:01:47,183] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default4]:[2022-10-07 09:01:47,183] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default5]:[2022-10-07 09:01:47,183] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default7]:[2022-10-07 09:01:47,183] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default0]:[2022-10-07 09:01:47,161] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default2]:[2022-10-07 09:01:47,161] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default1]:[2022-10-07 09:01:47,161] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default3]:[2022-10-07 09:01:47,161] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default4]:[2022-10-07 09:01:47,161] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default7]:[2022-10-07 09:01:47,161] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default6]:[2022-10-07 09:01:47,161] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default5]:[2022-10-07 09:01:47,161] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default5]:[2022-10-07 09:01:47,180] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default3]:[2022-10-07 09:01:47,180] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default2]:[2022-10-07 09:01:47,179] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default4]:[2022-10-07 09:01:47,180] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default6]:[2022-10-07 09:01:47,180] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default3]:[2022-10-07 09:01:47,168] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default1]:[2022-10-07 09:01:47,168] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default7]:[2022-10-07 09:01:47,168] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default6]:[2022-10-07 09:01:47,168] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default7]:[2022-10-07 09:01:47,180] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default2]:[2022-10-07 09:01:47,168] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default4]:[2022-10-07 09:01:47,168] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default0]:[2022-10-07 09:01:47,168] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default5]:[2022-10-07 09:01:47,168] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default6]:[2022-10-07 09:01:47,187] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default4]:[2022-10-07 09:01:47,187] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default0]:[2022-10-07 09:01:47,187] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default2]:[2022-10-07 09:01:47,186] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default7]:[2022-10-07 09:01:47,186] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default3]:[2022-10-07 09:01:47,186] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default1]:[2022-10-07 09:01:47,186] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default5]:[2022-10-07 09:01:47,186] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default1]:[2022-10-07 09:01:47,179] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default3]:[2022-10-07 09:01:47,179] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default0]:[2022-10-07 09:01:47,179] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default4]:[2022-10-07 09:01:47,179] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default2]:[2022-10-07 09:01:47,179] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default5]:[2022-10-07 09:01:47,179] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default7]:[2022-10-07 09:01:47,179] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default6]:[2022-10-07 09:01:47,179] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default3]:[2022-10-07 09:01:48,549] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default3]:[2022-10-07 09:01:48,560] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_03_model_states.pt... +[default1]:[2022-10-07 09:01:48,602] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default1]:[2022-10-07 09:01:48,611] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_03_model_states.pt... +[default6]:[2022-10-07 09:01:48,632] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default4]:[2022-10-07 09:01:48,633] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default2]:[2022-10-07 09:01:48,609] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default2]:[2022-10-07 09:01:48,620] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_02_model_states.pt... +[default1]:[2022-10-07 09:01:48,606] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default1]:[2022-10-07 09:01:48,618] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_03_model_states.pt... +[default2]:[2022-10-07 09:01:48,589] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default2]:[2022-10-07 09:01:48,600] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_02_model_states.pt... +[default1]:[2022-10-07 09:01:48,556] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default1]:[2022-10-07 09:01:48,566] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_01_model_states.pt... +[default0]:[2022-10-07 09:01:48,728] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default6]:[2022-10-07 09:01:48,643] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default6]:[2022-10-07 09:01:48,652] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default5]:[2022-10-07 09:01:48,633] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default5]:[2022-10-07 09:01:48,643] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_01_model_states.pt... +[default6]:[2022-10-07 09:01:48,643] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_02_model_states.pt... +[default4]:[2022-10-07 09:01:48,643] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_02_model_states.pt... +[default3]:[2022-10-07 09:01:48,654] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default3]:[2022-10-07 09:01:48,664] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_01_model_states.pt... +[default2]:[2022-10-07 09:01:48,715] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default2]:[2022-10-07 09:01:48,725] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default1]:[2022-10-07 09:01:48,652] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default1]:[2022-10-07 09:01:48,664] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_01_model_states.pt... +[default0]:[2022-10-07 09:01:48,714] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default0]:[2022-10-07 09:01:48,724] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default0]:[2022-10-07 09:01:48,741] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default0]:[2022-10-07 09:01:48,755] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_02_model_states.pt... +[default0]:[2022-10-07 09:01:48,736] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_02_model_states.pt... +[default3]:[2022-10-07 09:01:48,808] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default3]:[2022-10-07 09:01:48,819] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_01_model_states.pt... +[default3]:[2022-10-07 09:01:48,873] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default3]:[2022-10-07 09:01:48,888] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_03_model_states.pt... +[default3]:[2022-10-07 09:01:48,940] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default1]:[2022-10-07 09:01:48,913] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default1]:[2022-10-07 09:01:48,922] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_01_model_states.pt... +[default4]:[2022-10-07 09:01:49,007] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default4]:[2022-10-07 09:01:49,016] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default7]:[2022-10-07 09:01:49,029] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default7]:[2022-10-07 09:01:49,016] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default7]:[2022-10-07 09:01:49,035] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_03_model_states.pt... +[default5]:[2022-10-07 09:01:49,010] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default5]:[2022-10-07 09:01:49,024] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_03_model_states.pt... +[default3]:[2022-10-07 09:01:48,957] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default3]:[2022-10-07 09:01:48,965] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_03_model_states.pt... +[default2]:[2022-10-07 09:01:49,011] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default2]:[2022-10-07 09:01:49,030] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default0]:[2022-10-07 09:01:48,993] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default0]:[2022-10-07 09:01:49,001] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default3]:[2022-10-07 09:01:48,956] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_01_model_states.pt... +[default1]:[2022-10-07 09:01:49,079] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default1]:[2022-10-07 09:01:49,104] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_03_model_states.pt... +[default7]:[2022-10-07 09:01:49,049] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_01_model_states.pt... +[default0]:[2022-10-07 09:01:49,079] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default0]:[2022-10-07 09:01:49,096] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_02_model_states.pt... +[default2]:[2022-10-07 09:01:49,068] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default2]:[2022-10-07 09:01:49,081] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default0]:[2022-10-07 09:01:49,130] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default0]:[2022-10-07 09:01:49,139] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default2]:[2022-10-07 09:01:49,154] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default2]:[2022-10-07 09:01:49,163] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_02_model_states.pt... +[default0]:[2022-10-07 09:01:49,143] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default0]:[2022-10-07 09:01:49,152] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default0]:[2022-10-07 09:01:49,166] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default0]:[2022-10-07 09:01:49,176] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_02_model_states.pt... +[default6]:[2022-10-07 09:01:49,248] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default6]:[2022-10-07 09:01:49,257] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_02_model_states.pt... +[default5]:[2022-10-07 09:01:49,486] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default5]:[2022-10-07 09:01:49,496] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_03_model_states.pt... +[default4]:[2022-10-07 09:01:49,509] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default4]:[2022-10-07 09:01:49,519] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_02_model_states.pt... +[default6]:[2022-10-07 09:01:49,464] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default6]:[2022-10-07 09:01:49,484] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default4]:[2022-10-07 09:01:49,504] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default4]:[2022-10-07 09:01:49,516] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default7]:[2022-10-07 09:01:49,488] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default7]:[2022-10-07 09:01:49,496] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_01_model_states.pt... +[default5]:[2022-10-07 09:01:49,504] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default5]:[2022-10-07 09:01:49,516] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_01_model_states.pt... +[default6]:[2022-10-07 09:01:49,548] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default5]:[2022-10-07 09:01:49,617] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default5]:[2022-10-07 09:01:49,626] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_01_model_states.pt... +[default6]:[2022-10-07 09:01:49,557] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default6]:[2022-10-07 09:01:49,682] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default6]:[2022-10-07 09:01:49,691] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_02_model_states.pt... +[default7]:[2022-10-07 09:01:49,660] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default7]:[2022-10-07 09:01:49,669] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_03_model_states.pt... +[default4]:[2022-10-07 09:01:49,784] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default4]:[2022-10-07 09:01:49,793] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default7]:[2022-10-07 09:01:49,897] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default7]:[2022-10-07 09:01:49,907] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_01_model_states.pt... +[default1]:[2022-10-07 09:01:49,961] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default1]:[2022-10-07 09:01:49,970] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_01_model_states.pt... +[default1]:[2022-10-07 09:01:50,065] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default1]:[2022-10-07 09:01:50,074] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_03_model_states.pt... +[default2]:[2022-10-07 09:01:50,039] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default3]:[2022-10-07 09:01:50,043] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default3]:[2022-10-07 09:01:50,054] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_01_model_states.pt... +[default2]:[2022-10-07 09:01:50,050] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default5]:[2022-10-07 09:01:50,238] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default7]:[2022-10-07 09:01:50,210] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default7]:[2022-10-07 09:01:50,219] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_01_model_states.pt... +[default5]:[2022-10-07 09:01:50,243] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default2]:[2022-10-07 09:01:50,301] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default3]:[2022-10-07 09:01:50,292] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default2]:[2022-10-07 09:01:50,312] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_02_model_states.pt... +[default3]:[2022-10-07 09:01:50,304] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_03_model_states.pt... +[default4]:[2022-10-07 09:01:50,243] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default4]:[2022-10-07 09:01:50,253] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_02_model_states.pt... +[default7]:[2022-10-07 09:01:50,261] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default7]:[2022-10-07 09:01:50,270] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_03_model_states.pt... +[default5]:[2022-10-07 09:01:50,249] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_03_model_states.pt... +[default6]:[2022-10-07 09:01:50,293] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default6]:[2022-10-07 09:01:50,310] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default5]:[2022-10-07 09:01:50,252] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_01_model_states.pt... +[default4]:[2022-10-07 09:01:50,300] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default4]:[2022-10-07 09:01:50,313] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default5]:[2022-10-07 09:01:50,422] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default6]:[2022-10-07 09:01:50,363] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default6]:[2022-10-07 09:01:50,371] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_02_model_states.pt... +[default5]:[2022-10-07 09:01:50,432] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_03_model_states.pt... +[default4]:[2022-10-07 09:01:50,692] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default4]:[2022-10-07 09:01:50,706] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_02_model_states.pt... +[default7]:[2022-10-07 09:01:50,662] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default7]:[2022-10-07 09:01:50,670] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_03_model_states.pt... +[default0]:[2022-10-07 09:01:50,781] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_02_model_states.pt. +[default0]:[2022-10-07 09:01:50,782] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default2]:[2022-10-07 09:01:50,780] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_02_model_states.pt. +[default2]:[2022-10-07 09:01:50,781] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default3]:[2022-10-07 09:01:50,917] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_03_model_states.pt. +[default3]:[2022-10-07 09:01:50,918] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:50,916] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default0]:[2022-10-07 09:01:50,916] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default2]:[2022-10-07 09:01:50,915] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:50,915] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default2]:[2022-10-07 09:01:50,938] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:50,944] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default3]:[2022-10-07 09:01:50,998] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default3]:[2022-10-07 09:01:50,999] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_01-model_states.pt... +[default0]:[2022-10-07 09:01:50,943] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default0]:[2022-10-07 09:01:50,952] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:51,020] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default0]:[2022-10-07 09:01:51,020] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default4]:[2022-10-07 09:01:50,985] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_02_model_states.pt. +[default4]:[2022-10-07 09:01:50,986] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default6]:[2022-10-07 09:01:50,984] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_02_model_states.pt. +[default6]:[2022-10-07 09:01:50,984] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default2]:[2022-10-07 09:01:51,019] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:51,020] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default2]:[2022-10-07 09:01:51,037] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:51,041] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default3]:[2022-10-07 09:01:51,042] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_01-model_states.pt. +[default3]:[2022-10-07 09:01:51,047] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default3]:[2022-10-07 09:01:51,078] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default3]:[2022-10-07 09:01:51,079] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_01-model_states.pt... +[default3]:[2022-10-07 09:01:51,112] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_01-model_states.pt. +[default3]:[2022-10-07 09:01:51,117] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:51,041] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default0]:[2022-10-07 09:01:51,045] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:51,088] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default0]:[2022-10-07 09:01:51,088] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:51,107] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default0]:[2022-10-07 09:01:51,113] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default4]:[2022-10-07 09:01:51,059] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:51,060] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default4]:[2022-10-07 09:01:51,084] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:51,092] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default4]:[2022-10-07 09:01:51,121] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:51,121] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default6]:[2022-10-07 09:01:51,052] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:51,052] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default6]:[2022-10-07 09:01:51,073] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:51,078] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default6]:[2022-10-07 09:01:51,120] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:51,121] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default2]:[2022-10-07 09:01:51,138] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_02_model_states.pt. +[default2]:[2022-10-07 09:01:51,139] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default2]:[2022-10-07 09:01:51,087] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:51,087] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default2]:[2022-10-07 09:01:51,104] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:51,109] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default3]:[2022-10-07 09:01:51,128] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_01_model_states.pt. +[default3]:[2022-10-07 09:01:51,128] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default1]:[2022-10-07 09:01:51,128] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_01_model_states.pt. +[default1]:[2022-10-07 09:01:51,129] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default3]:[2022-10-07 09:01:51,156] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default3]:[2022-10-07 09:01:51,159] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_01-model_states.pt... +[default0]:[2022-10-07 09:01:51,209] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default0]:[2022-10-07 09:01:51,210] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default4]:[2022-10-07 09:01:51,170] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default4]:[2022-10-07 09:01:51,172] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default6]:[2022-10-07 09:01:51,170] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default6]:[2022-10-07 09:01:51,171] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:51,171] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default0]:[2022-10-07 09:01:51,171] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:51,199] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default0]:[2022-10-07 09:01:51,207] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default4]:[2022-10-07 09:01:51,152] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:51,158] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default4]:[2022-10-07 09:01:51,203] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:51,203] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default6]:[2022-10-07 09:01:51,146] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:51,150] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default6]:[2022-10-07 09:01:51,203] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:51,203] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default6]:[2022-10-07 09:01:51,230] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:51,234] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default2]:[2022-10-07 09:01:51,212] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:51,212] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default2]:[2022-10-07 09:01:51,233] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:51,238] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:51,150] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_02_model_states.pt. +[default0]:[2022-10-07 09:01:51,150] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:51,212] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default0]:[2022-10-07 09:01:51,212] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:51,240] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:51,170] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_02_model_states.pt. +[default6]:[2022-10-07 09:01:51,171] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default6]:[2022-10-07 09:01:51,206] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:51,206] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default6]:[2022-10-07 09:01:51,228] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:51,237] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default3]:[2022-10-07 09:01:51,218] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_01_model_states.pt. +[default3]:[2022-10-07 09:01:51,219] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default1]:[2022-10-07 09:01:51,210] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_01_model_states.pt. +[default1]:[2022-10-07 09:01:51,211] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default2]:[2022-10-07 09:01:51,171] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:51,171] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default2]:[2022-10-07 09:01:51,193] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:51,198] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default3]:[2022-10-07 09:01:51,304] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_01-model_states.pt. +[default3]:[2022-10-07 09:01:51,308] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default4]:[2022-10-07 09:01:51,239] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:51,245] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default4]:[2022-10-07 09:01:51,300] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:51,300] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default4]:[2022-10-07 09:01:51,333] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:51,296] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:51,297] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default6]:[2022-10-07 09:01:51,321] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:51,325] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default2]:[2022-10-07 09:01:51,287] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:51,287] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default2]:[2022-10-07 09:01:51,305] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:51,310] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:51,246] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:51,293] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default0]:[2022-10-07 09:01:51,293] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:51,323] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default0]:[2022-10-07 09:01:51,330] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default6]:[2022-10-07 09:01:51,295] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:51,295] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default6]:[2022-10-07 09:01:51,325] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:51,332] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default3]:[2022-10-07 09:01:51,364] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default3]:[2022-10-07 09:01:51,365] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_01-model_states.pt... +[default3]:[2022-10-07 09:01:51,425] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_01-model_states.pt. +[default3]:[2022-10-07 09:01:51,431] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default6]:[2022-10-07 09:01:51,430] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:51,430] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:51,378] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default0]:[2022-10-07 09:01:51,378] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:51,407] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default0]:[2022-10-07 09:01:51,415] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default4]:[2022-10-07 09:01:51,338] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default2]:[2022-10-07 09:01:51,355] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:51,355] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default2]:[2022-10-07 09:01:51,379] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:51,385] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default2]:[2022-10-07 09:01:51,441] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:51,441] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:51,364] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default0]:[2022-10-07 09:01:51,364] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:51,396] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default0]:[2022-10-07 09:01:51,402] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default6]:[2022-10-07 09:01:51,366] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:51,366] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default6]:[2022-10-07 09:01:51,399] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:51,404] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default6]:[2022-10-07 09:01:51,442] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:51,442] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default3]:[2022-10-07 09:01:51,426] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default3]:[2022-10-07 09:01:51,427] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_01-model_states.pt... +[default1]:[2022-10-07 09:01:51,416] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default1]:[2022-10-07 09:01:51,420] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_01-model_states.pt... +[default2]:[2022-10-07 09:01:51,378] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:51,378] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default2]:[2022-10-07 09:01:51,401] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:51,406] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default3]:[2022-10-07 09:01:51,388] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default3]:[2022-10-07 09:01:51,390] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_01-model_states.pt... +[default1]:[2022-10-07 09:01:51,388] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default1]:[2022-10-07 09:01:51,390] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_01-model_states.pt... +[default3]:[2022-10-07 09:01:51,529] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default3]:[2022-10-07 09:01:51,531] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_01-model_states.pt... +[default0]:[2022-10-07 09:01:51,469] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default0]:[2022-10-07 09:01:51,471] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default4]:[2022-10-07 09:01:51,456] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:51,457] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:51,462] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default0]:[2022-10-07 09:01:51,462] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:51,486] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default0]:[2022-10-07 09:01:51,493] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:51,525] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default0]:[2022-10-07 09:01:51,525] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default4]:[2022-10-07 09:01:51,450] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:51,451] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default4]:[2022-10-07 09:01:51,476] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:51,482] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default4]:[2022-10-07 09:01:51,514] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:51,514] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default6]:[2022-10-07 09:01:51,447] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:51,447] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default6]:[2022-10-07 09:01:51,469] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:51,474] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default6]:[2022-10-07 09:01:51,511] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:51,511] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default6]:[2022-10-07 09:01:51,531] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:51,536] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default1]:[2022-10-07 09:01:51,475] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_03_model_states.pt. +[default1]:[2022-10-07 09:01:51,475] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default3]:[2022-10-07 09:01:51,493] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_03_model_states.pt. +[default3]:[2022-10-07 09:01:51,494] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default3]:[2022-10-07 09:01:51,523] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default3]:[2022-10-07 09:01:51,523] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_01-model_states.pt... +[default3]:[2022-10-07 09:01:51,540] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_01-model_states.pt. +[default1]:[2022-10-07 09:01:51,504] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default1]:[2022-10-07 09:01:51,505] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_01-model_states.pt... +[default1]:[2022-10-07 09:01:51,531] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_01-model_states.pt. +[default1]:[2022-10-07 09:01:51,539] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default2]:[2022-10-07 09:01:51,462] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:51,466] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:51,443] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default0]:[2022-10-07 09:01:51,443] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:51,470] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default0]:[2022-10-07 09:01:51,477] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default6]:[2022-10-07 09:01:51,471] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:51,478] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default2]:[2022-10-07 09:01:51,461] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:51,461] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default2]:[2022-10-07 09:01:51,482] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:51,486] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default2]:[2022-10-07 09:01:51,525] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:51,525] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default2]:[2022-10-07 09:01:51,544] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default3]:[2022-10-07 09:01:51,586] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_01-model_states.pt. +[default3]:[2022-10-07 09:01:51,591] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:51,607] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:51,571] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:51,617] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default6]:[2022-10-07 09:01:51,557] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:51,603] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:51,544] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default0]:[2022-10-07 09:01:51,550] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:51,603] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default0]:[2022-10-07 09:01:51,604] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:51,626] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default0]:[2022-10-07 09:01:51,632] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default1]:[2022-10-07 09:01:51,611] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_03_model_states.pt. +[default1]:[2022-10-07 09:01:51,612] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default4]:[2022-10-07 09:01:51,539] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:51,544] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default4]:[2022-10-07 09:01:51,579] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:51,580] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default4]:[2022-10-07 09:01:51,607] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:51,614] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default5]:[2022-10-07 09:01:51,615] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_03_model_states.pt. +[default5]:[2022-10-07 09:01:51,616] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default6]:[2022-10-07 09:01:51,569] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:51,570] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default6]:[2022-10-07 09:01:51,593] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:51,598] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default7]:[2022-10-07 09:01:51,632] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_03_model_states.pt. +[default7]:[2022-10-07 09:01:51,632] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default2]:[2022-10-07 09:01:51,550] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default1]:[2022-10-07 09:01:51,573] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:51,550] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default2]:[2022-10-07 09:01:51,578] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:51,588] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default3]:[2022-10-07 09:01:51,552] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default3]:[2022-10-07 09:01:51,582] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default3]:[2022-10-07 09:01:51,582] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_01-model_states.pt... +[default1]:[2022-10-07 09:01:51,576] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_01-model_states.pt... +[default2]:[2022-10-07 09:01:51,630] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:51,630] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:51,547] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default0]:[2022-10-07 09:01:51,547] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:51,578] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default0]:[2022-10-07 09:01:51,586] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:51,635] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default0]:[2022-10-07 09:01:51,636] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default6]:[2022-10-07 09:01:51,544] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:51,545] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default6]:[2022-10-07 09:01:51,577] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:51,589] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default6]:[2022-10-07 09:01:51,637] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:51,637] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default3]:[2022-10-07 09:01:51,579] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_01-model_states.pt. +[default1]:[2022-10-07 09:01:51,573] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_01-model_states.pt. +[default3]:[2022-10-07 09:01:51,611] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_03_model_states.pt. +[default3]:[2022-10-07 09:01:51,612] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default3]:[2022-10-07 09:01:51,645] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default3]:[2022-10-07 09:01:51,645] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_01-model_states.pt... +[default2]:[2022-10-07 09:01:51,550] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default2]:[2022-10-07 09:01:51,603] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:51,603] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default2]:[2022-10-07 09:01:51,624] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:51,630] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:51,642] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default0]:[2022-10-07 09:01:51,643] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default3]:[2022-10-07 09:01:51,562] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_01-model_states.pt. +[default3]:[2022-10-07 09:01:51,605] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default1]:[2022-10-07 09:01:51,562] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_01-model_states.pt. +[default1]:[2022-10-07 09:01:51,593] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default3]:[2022-10-07 09:01:51,691] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default3]:[2022-10-07 09:01:51,693] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_01-model_states.pt... +[default0]:[2022-10-07 09:01:51,655] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:51,693] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default0]:[2022-10-07 09:01:51,693] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:51,726] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:51,680] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:51,680] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default4]:[2022-10-07 09:01:51,709] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:51,715] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default6]:[2022-10-07 09:01:51,680] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:51,680] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default6]:[2022-10-07 09:01:51,706] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:51,713] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:51,704] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default0]:[2022-10-07 09:01:51,704] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:51,724] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default0]:[2022-10-07 09:01:51,730] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default1]:[2022-10-07 09:01:51,643] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default1]:[2022-10-07 09:01:51,644] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_01-model_states.pt... +[default1]:[2022-10-07 09:01:51,724] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_01-model_states.pt. +[default1]:[2022-10-07 09:01:51,732] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default4]:[2022-10-07 09:01:51,645] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:51,646] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default4]:[2022-10-07 09:01:51,674] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:51,681] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default4]:[2022-10-07 09:01:51,713] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:51,714] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default5]:[2022-10-07 09:01:51,657] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default5]:[2022-10-07 09:01:51,659] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_01-model_states.pt... +[default5]:[2022-10-07 09:01:51,727] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_01-model_states.pt. +[default5]:[2022-10-07 09:01:51,731] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default6]:[2022-10-07 09:01:51,641] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:51,641] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default6]:[2022-10-07 09:01:51,663] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:51,668] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default6]:[2022-10-07 09:01:51,711] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:51,711] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default7]:[2022-10-07 09:01:51,659] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default7]:[2022-10-07 09:01:51,659] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_01-model_states.pt... +[default7]:[2022-10-07 09:01:51,727] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_01-model_states.pt. +[default7]:[2022-10-07 09:01:51,736] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default1]:[2022-10-07 09:01:51,689] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_01-model_states.pt. +[default3]:[2022-10-07 09:01:51,691] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_01-model_states.pt. +[default3]:[2022-10-07 09:01:51,703] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default3]:[2022-10-07 09:01:51,731] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default3]:[2022-10-07 09:01:51,731] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_01-model_states.pt... +[default1]:[2022-10-07 09:01:51,700] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default2]:[2022-10-07 09:01:51,650] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:51,655] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default2]:[2022-10-07 09:01:51,691] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:51,691] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:51,664] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default0]:[2022-10-07 09:01:51,671] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:51,694] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:51,717] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:51,721] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:51,695] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:51,725] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default1]:[2022-10-07 09:01:51,723] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default1]:[2022-10-07 09:01:51,725] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_01-model_states.pt... +[default0]:[2022-10-07 09:01:51,732] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default6]:[2022-10-07 09:01:51,664] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:51,670] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default6]:[2022-10-07 09:01:51,698] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:51,698] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default6]:[2022-10-07 09:01:51,726] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:51,733] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default3]:[2022-10-07 09:01:51,690] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default1]:[2022-10-07 09:01:51,683] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default1]:[2022-10-07 09:01:51,746] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default3]:[2022-10-07 09:01:51,724] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_01-model_states.pt. +[default3]:[2022-10-07 09:01:51,732] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default2]:[2022-10-07 09:01:51,686] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default2]:[2022-10-07 09:01:51,687] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default2]:[2022-10-07 09:01:51,704] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:51,704] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default2]:[2022-10-07 09:01:51,723] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:51,730] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default3]:[2022-10-07 09:01:51,708] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default3]:[2022-10-07 09:01:51,710] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_01-model_states.pt... +[default1]:[2022-10-07 09:01:51,708] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default1]:[2022-10-07 09:01:51,710] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_01-model_states.pt... +[default3]:[2022-10-07 09:01:51,765] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_01-model_states.pt. +[default3]:[2022-10-07 09:01:51,770] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default3]:[2022-10-07 09:01:51,802] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default3]:[2022-10-07 09:01:51,803] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_01-model_states.pt... +[default0]:[2022-10-07 09:01:51,812] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_02_model_states.pt. +[default0]:[2022-10-07 09:01:51,813] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:51,736] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:51,796] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default0]:[2022-10-07 09:01:51,797] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default4]:[2022-10-07 09:01:51,788] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:51,788] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default6]:[2022-10-07 09:01:51,787] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:51,787] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default1]:[2022-10-07 09:01:51,753] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default0]:[2022-10-07 09:01:51,775] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default1]:[2022-10-07 09:01:51,754] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_01-model_states.pt... +[default1]:[2022-10-07 09:01:51,806] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_01-model_states.pt. +[default1]:[2022-10-07 09:01:51,812] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:51,775] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:51,795] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default0]:[2022-10-07 09:01:51,801] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default4]:[2022-10-07 09:01:51,754] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:51,759] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default4]:[2022-10-07 09:01:51,804] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:51,804] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default4]:[2022-10-07 09:01:51,838] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default5]:[2022-10-07 09:01:51,764] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default5]:[2022-10-07 09:01:51,765] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_01-model_states.pt... +[default5]:[2022-10-07 09:01:51,812] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_01-model_states.pt. +[default5]:[2022-10-07 09:01:51,817] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default6]:[2022-10-07 09:01:51,750] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:51,755] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default6]:[2022-10-07 09:01:51,806] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:51,806] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default6]:[2022-10-07 09:01:51,831] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:51,836] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default7]:[2022-10-07 09:01:51,767] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default7]:[2022-10-07 09:01:51,767] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_01-model_states.pt... +[default7]:[2022-10-07 09:01:51,810] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_01-model_states.pt. +[default7]:[2022-10-07 09:01:51,818] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default3]:[2022-10-07 09:01:51,780] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_01-model_states.pt. +[default3]:[2022-10-07 09:01:51,787] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default3]:[2022-10-07 09:01:51,816] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default3]:[2022-10-07 09:01:51,816] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_01-model_states.pt... +[default1]:[2022-10-07 09:01:51,779] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_01-model_states.pt. +[default1]:[2022-10-07 09:01:51,787] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default1]:[2022-10-07 09:01:51,808] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default1]:[2022-10-07 09:01:51,809] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_01-model_states.pt... +[default2]:[2022-10-07 09:01:51,800] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:51,800] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:51,807] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default0]:[2022-10-07 09:01:51,807] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:51,837] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default0]:[2022-10-07 09:01:51,844] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default6]:[2022-10-07 09:01:51,802] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:51,823] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:51,830] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default6]:[2022-10-07 09:01:51,802] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default6]:[2022-10-07 09:01:51,830] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:51,837] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default3]:[2022-10-07 09:01:51,747] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default3]:[2022-10-07 09:01:51,748] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_01-model_states.pt... +[default3]:[2022-10-07 09:01:51,808] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_01-model_states.pt. +[default3]:[2022-10-07 09:01:51,832] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default1]:[2022-10-07 09:01:51,748] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_01-model_states.pt... +[default1]:[2022-10-07 09:01:51,806] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_01-model_states.pt. +[default1]:[2022-10-07 09:01:51,822] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default3]:[2022-10-07 09:01:51,752] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default3]:[2022-10-07 09:01:51,753] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_01-model_states.pt... +[default3]:[2022-10-07 09:01:51,808] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_01-model_states.pt. +[default3]:[2022-10-07 09:01:51,819] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default2]:[2022-10-07 09:01:51,772] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:51,773] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default2]:[2022-10-07 09:01:51,791] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:51,796] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default2]:[2022-10-07 09:01:51,841] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:51,842] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default3]:[2022-10-07 09:01:51,787] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_01-model_states.pt. +[default3]:[2022-10-07 09:01:51,793] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default1]:[2022-10-07 09:01:51,783] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_01-model_states.pt. +[default1]:[2022-10-07 09:01:51,789] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default3]:[2022-10-07 09:01:51,875] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_01-model_states.pt. +[default3]:[2022-10-07 09:01:51,880] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default3]:[2022-10-07 09:01:51,913] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default3]:[2022-10-07 09:01:51,914] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_01-model_states.pt... +[default0]:[2022-10-07 09:01:51,843] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default0]:[2022-10-07 09:01:51,843] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:51,869] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default0]:[2022-10-07 09:01:51,875] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:51,898] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default0]:[2022-10-07 09:01:51,898] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:51,921] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default0]:[2022-10-07 09:01:51,925] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:51,870] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default0]:[2022-10-07 09:01:51,878] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default4]:[2022-10-07 09:01:51,865] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:51,872] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default6]:[2022-10-07 09:01:51,864] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:51,873] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default1]:[2022-10-07 09:01:51,848] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default1]:[2022-10-07 09:01:51,849] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_01-model_states.pt... +[default0]:[2022-10-07 09:01:51,843] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default0]:[2022-10-07 09:01:51,843] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:51,864] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default0]:[2022-10-07 09:01:51,868] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default4]:[2022-10-07 09:01:51,844] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default4]:[2022-10-07 09:01:51,876] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:51,876] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default4]:[2022-10-07 09:01:51,906] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:51,913] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default5]:[2022-10-07 09:01:51,850] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default5]:[2022-10-07 09:01:51,855] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_01-model_states.pt... +[default5]:[2022-10-07 09:01:51,932] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_01-model_states.pt. +[default5]:[2022-10-07 09:01:51,937] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default6]:[2022-10-07 09:01:51,875] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:51,875] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default6]:[2022-10-07 09:01:51,897] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:51,906] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default7]:[2022-10-07 09:01:51,851] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default7]:[2022-10-07 09:01:51,855] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_01-model_states.pt... +[default7]:[2022-10-07 09:01:51,934] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_01-model_states.pt. +[default3]:[2022-10-07 09:01:51,861] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_01-model_states.pt. +[default3]:[2022-10-07 09:01:51,871] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default3]:[2022-10-07 09:01:51,901] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default3]:[2022-10-07 09:01:51,901] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_01-model_states.pt... +[default3]:[2022-10-07 09:01:51,936] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_01-model_states.pt. +[default1]:[2022-10-07 09:01:51,858] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_01-model_states.pt. +[default1]:[2022-10-07 09:01:51,870] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default1]:[2022-10-07 09:01:51,894] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default1]:[2022-10-07 09:01:51,895] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_01-model_states.pt... +[default1]:[2022-10-07 09:01:51,931] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_01-model_states.pt. +[default0]:[2022-10-07 09:01:51,886] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default0]:[2022-10-07 09:01:51,886] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:51,917] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default0]:[2022-10-07 09:01:51,923] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default6]:[2022-10-07 09:01:51,879] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:51,880] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default2]:[2022-10-07 09:01:51,878] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:51,878] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default2]:[2022-10-07 09:01:51,900] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:51,905] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:51,911] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default2]:[2022-10-07 09:01:51,905] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default2]:[2022-10-07 09:01:51,853] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:51,855] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default3]:[2022-10-07 09:01:51,856] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default3]:[2022-10-07 09:01:51,856] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_01-model_states.pt... +[default2]:[2022-10-07 09:01:51,861] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:51,865] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:51,852] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default0]:[2022-10-07 09:01:51,852] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:51,953] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default0]:[2022-10-07 09:01:51,954] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:51,981] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default0]:[2022-10-07 09:01:51,987] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:52,013] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default0]:[2022-10-07 09:01:52,013] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:51,995] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default0]:[2022-10-07 09:01:51,996] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default4]:[2022-10-07 09:01:51,989] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:51,990] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default4]:[2022-10-07 09:01:52,017] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:52,021] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default5]:[2022-10-07 09:01:52,011] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_01_model_states.pt. +[default5]:[2022-10-07 09:01:52,011] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default6]:[2022-10-07 09:01:51,989] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:51,989] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default6]:[2022-10-07 09:01:52,012] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:52,017] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default1]:[2022-10-07 09:01:51,950] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_01-model_states.pt. +[default1]:[2022-10-07 09:01:51,957] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default1]:[2022-10-07 09:01:51,987] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default1]:[2022-10-07 09:01:51,988] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_01-model_states.pt... +[default1]:[2022-10-07 09:01:52,016] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_01-model_states.pt. +[default1]:[2022-10-07 09:01:52,021] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:51,958] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default0]:[2022-10-07 09:01:51,958] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:51,983] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default0]:[2022-10-07 09:01:51,989] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:51,989] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default0]:[2022-10-07 09:01:51,989] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:51,989] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:51,981] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:51,982] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default4]:[2022-10-07 09:01:52,022] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:52,032] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default4]:[2022-10-07 09:01:52,032] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:52,032] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default4]:[2022-10-07 09:01:52,032] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default5]:[2022-10-07 09:01:51,964] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default5]:[2022-10-07 09:01:51,966] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_01-model_states.pt... +[default5]:[2022-10-07 09:01:52,013] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_01-model_states.pt. +[default5]:[2022-10-07 09:01:52,019] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default6]:[2022-10-07 09:01:51,979] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:51,979] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default6]:[2022-10-07 09:01:52,005] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:52,011] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default6]:[2022-10-07 09:01:52,019] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:52,019] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default6]:[2022-10-07 09:01:52,019] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default3]:[2022-10-07 09:01:51,950] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default3]:[2022-10-07 09:01:51,991] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default3]:[2022-10-07 09:01:51,992] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_01-model_states.pt... +[default1]:[2022-10-07 09:01:51,945] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default1]:[2022-10-07 09:01:51,990] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default1]:[2022-10-07 09:01:51,991] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_01-model_states.pt... +[default3]:[2022-10-07 09:01:51,961] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default3]:[2022-10-07 09:01:51,962] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_01-model_states.pt... +[default3]:[2022-10-07 09:01:52,007] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_01-model_states.pt. +[default3]:[2022-10-07 09:01:52,021] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default2]:[2022-10-07 09:01:52,038] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default1]:[2022-10-07 09:01:51,961] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default1]:[2022-10-07 09:01:51,962] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_01-model_states.pt... +[default1]:[2022-10-07 09:01:52,004] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_01-model_states.pt. +[default1]:[2022-10-07 09:01:52,015] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default3]:[2022-10-07 09:01:51,951] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_01-model_states.pt. +[default3]:[2022-10-07 09:01:51,963] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default3]:[2022-10-07 09:01:51,991] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default3]:[2022-10-07 09:01:51,991] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_01-model_states.pt... +[default3]:[2022-10-07 09:01:52,018] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_01-model_states.pt. +[default3]:[2022-10-07 09:01:52,026] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default3]:[2022-10-07 09:01:52,046] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default3]:[2022-10-07 09:01:52,046] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_01-model_states.pt... +[default2]:[2022-10-07 09:01:51,958] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:51,959] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default2]:[2022-10-07 09:01:51,978] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:51,983] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default2]:[2022-10-07 09:01:51,986] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:51,986] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default2]:[2022-10-07 09:01:51,986] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default0]:[2022-10-07 09:01:52,010] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default3]:[2022-10-07 09:01:51,964] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default3]:[2022-10-07 09:01:51,964] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_01-model_states.pt... +[default3]:[2022-10-07 09:01:52,009] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_01-model_states.pt. +[default3]:[2022-10-07 09:01:52,014] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default1]:[2022-10-07 09:01:51,964] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default1]:[2022-10-07 09:01:51,964] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_01-model_states.pt... +[default1]:[2022-10-07 09:01:52,009] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_01-model_states.pt. +[default1]:[2022-10-07 09:01:52,015] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default7]:[2022-10-07 09:01:51,942] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default7]:[2022-10-07 09:01:51,965] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default7]:[2022-10-07 09:01:51,966] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_01-model_states.pt... +[default7]:[2022-10-07 09:01:52,013] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_01-model_states.pt. +[default7]:[2022-10-07 09:01:52,020] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:51,959] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default0]:[2022-10-07 09:01:51,959] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:51,990] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default0]:[2022-10-07 09:01:51,996] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:52,031] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default0]:[2022-10-07 09:01:52,032] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default2]:[2022-10-07 09:01:51,946] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:51,946] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default2]:[2022-10-07 09:01:51,969] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:51,974] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default2]:[2022-10-07 09:01:52,011] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:52,011] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default2]:[2022-10-07 09:01:52,035] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:52,040] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default6]:[2022-10-07 09:01:51,950] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:51,950] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default6]:[2022-10-07 09:01:51,982] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:51,987] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default6]:[2022-10-07 09:01:52,023] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:52,024] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:52,039] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default0]:[2022-10-07 09:01:52,044] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:52,069] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default0]:[2022-10-07 09:01:52,069] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:52,093] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default0]:[2022-10-07 09:01:52,098] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:52,037] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default0]:[2022-10-07 09:01:52,047] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:52,081] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default0]:[2022-10-07 09:01:52,081] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:52,125] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default0]:[2022-10-07 09:01:52,131] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default4]:[2022-10-07 09:01:52,070] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:52,070] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default4]:[2022-10-07 09:01:52,097] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:52,102] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default6]:[2022-10-07 09:01:52,068] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:52,068] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default6]:[2022-10-07 09:01:52,089] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:52,093] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default1]:[2022-10-07 09:01:52,040] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default1]:[2022-10-07 09:01:52,040] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_01-model_states.pt... +[default1]:[2022-10-07 09:01:52,109] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_01-model_states.pt. +[default1]:[2022-10-07 09:01:52,115] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default1]:[2022-10-07 09:01:52,134] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default1]:[2022-10-07 09:01:52,135] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_01-model_states.pt... +[default5]:[2022-10-07 09:01:52,052] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default5]:[2022-10-07 09:01:52,053] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_01-model_states.pt... +[default5]:[2022-10-07 09:01:52,122] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_01-model_states.pt. +[default5]:[2022-10-07 09:01:52,127] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default7]:[2022-10-07 09:01:52,054] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default7]:[2022-10-07 09:01:52,054] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_01-model_states.pt... +[default7]:[2022-10-07 09:01:52,122] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_01-model_states.pt. +[default7]:[2022-10-07 09:01:52,130] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default3]:[2022-10-07 09:01:52,055] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_01-model_states.pt. +[default3]:[2022-10-07 09:01:52,064] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default3]:[2022-10-07 09:01:52,089] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default3]:[2022-10-07 09:01:52,089] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_01-model_states.pt... +[default3]:[2022-10-07 09:01:52,137] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_01-model_states.pt. +[default1]:[2022-10-07 09:01:52,055] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_01-model_states.pt. +[default1]:[2022-10-07 09:01:52,065] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default1]:[2022-10-07 09:01:52,087] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default1]:[2022-10-07 09:01:52,087] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_01-model_states.pt... +[default1]:[2022-10-07 09:01:52,137] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_01-model_states.pt. +[default0]:[2022-10-07 09:01:52,062] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default0]:[2022-10-07 09:01:52,068] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default6]:[2022-10-07 09:01:52,055] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:52,060] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default6]:[2022-10-07 09:01:52,082] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_02_model_states.pt. +[default6]:[2022-10-07 09:01:52,083] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default6]:[2022-10-07 09:01:52,115] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:52,115] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default6]:[2022-10-07 09:01:52,138] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:52,079] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default2]:[2022-10-07 09:01:52,079] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default3]:[2022-10-07 09:01:52,066] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default3]:[2022-10-07 09:01:52,067] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_01-model_states.pt... +[default3]:[2022-10-07 09:01:52,114] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_01-model_states.pt. +[default3]:[2022-10-07 09:01:52,123] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default1]:[2022-10-07 09:01:52,065] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default1]:[2022-10-07 09:01:52,066] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_01-model_states.pt... +[default1]:[2022-10-07 09:01:52,113] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_01-model_states.pt. +[default1]:[2022-10-07 09:01:52,120] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default2]:[2022-10-07 09:01:52,104] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default2]:[2022-10-07 09:01:52,133] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:52,133] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default3]:[2022-10-07 09:01:52,111] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_01-model_states.pt. +[default4]:[2022-10-07 09:01:52,082] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_02_model_states.pt. +[default4]:[2022-10-07 09:01:52,083] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default4]:[2022-10-07 09:01:52,112] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:52,112] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default4]:[2022-10-07 09:01:52,133] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:52,137] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default3]:[2022-10-07 09:01:52,119] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default3]:[2022-10-07 09:01:52,139] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default3]:[2022-10-07 09:01:52,140] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_01-model_states.pt... +[default0]:[2022-10-07 09:01:52,053] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:52,089] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default0]:[2022-10-07 09:01:52,089] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:52,107] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default0]:[2022-10-07 09:01:52,113] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:52,133] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default0]:[2022-10-07 09:01:52,133] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default3]:[2022-10-07 09:01:52,086] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default3]:[2022-10-07 09:01:52,090] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_01-model_states.pt... +[default3]:[2022-10-07 09:01:52,142] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_01-model_states.pt. +[default3]:[2022-10-07 09:01:52,147] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:52,079] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default0]:[2022-10-07 09:01:52,080] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default1]:[2022-10-07 09:01:52,086] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default1]:[2022-10-07 09:01:52,090] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_01-model_states.pt... +[default1]:[2022-10-07 09:01:52,141] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_01-model_states.pt. +[default1]:[2022-10-07 09:01:52,146] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default3]:[2022-10-07 09:01:52,185] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_01-model_states.pt. +[default3]:[2022-10-07 09:01:52,191] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:52,144] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default0]:[2022-10-07 09:01:52,144] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:52,182] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default0]:[2022-10-07 09:01:52,189] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:52,221] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default0]:[2022-10-07 09:01:52,221] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:52,171] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:52,166] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:52,166] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default4]:[2022-10-07 09:01:52,195] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:52,199] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default4]:[2022-10-07 09:01:52,234] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:52,234] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:52,172] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:52,204] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default0]:[2022-10-07 09:01:52,211] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default7]:[2022-10-07 09:01:52,214] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_01_model_states.pt. +[default7]:[2022-10-07 09:01:52,215] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default5]:[2022-10-07 09:01:52,223] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default5]:[2022-10-07 09:01:52,225] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_01-model_states.pt... +[default6]:[2022-10-07 09:01:52,166] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:52,166] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default6]:[2022-10-07 09:01:52,190] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:52,194] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default6]:[2022-10-07 09:01:52,234] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:52,234] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:52,219] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_02_model_states.pt. +[default0]:[2022-10-07 09:01:52,220] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default1]:[2022-10-07 09:01:52,177] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_01-model_states.pt. +[default1]:[2022-10-07 09:01:52,183] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default1]:[2022-10-07 09:01:52,202] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default1]:[2022-10-07 09:01:52,203] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_01-model_states.pt... +[default1]:[2022-10-07 09:01:52,231] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_01-model_states.pt. +[default1]:[2022-10-07 09:01:52,237] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default5]:[2022-10-07 09:01:52,161] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default5]:[2022-10-07 09:01:52,163] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_01-model_states.pt... +[default5]:[2022-10-07 09:01:52,194] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_01-model_states.pt. +[default5]:[2022-10-07 09:01:52,200] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default5]:[2022-10-07 09:01:52,223] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default5]:[2022-10-07 09:01:52,225] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_01-model_states.pt... +[default7]:[2022-10-07 09:01:52,161] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default7]:[2022-10-07 09:01:52,163] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_01-model_states.pt... +[default7]:[2022-10-07 09:01:52,194] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_01-model_states.pt. +[default7]:[2022-10-07 09:01:52,202] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default7]:[2022-10-07 09:01:52,223] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default7]:[2022-10-07 09:01:52,225] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_01-model_states.pt... +[default3]:[2022-10-07 09:01:52,152] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default3]:[2022-10-07 09:01:52,196] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default3]:[2022-10-07 09:01:52,196] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_01-model_states.pt... +[default1]:[2022-10-07 09:01:52,151] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default1]:[2022-10-07 09:01:52,192] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default1]:[2022-10-07 09:01:52,193] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_01-model_states.pt... +[default6]:[2022-10-07 09:01:52,147] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default6]:[2022-10-07 09:01:52,178] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:52,178] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default3]:[2022-10-07 09:01:52,225] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default3]:[2022-10-07 09:01:52,226] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_01-model_states.pt... +[default1]:[2022-10-07 09:01:52,225] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default1]:[2022-10-07 09:01:52,226] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_01-model_states.pt... +[default2]:[2022-10-07 09:01:52,162] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:52,169] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default2]:[2022-10-07 09:01:52,201] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:52,201] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default2]:[2022-10-07 09:01:52,234] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:52,241] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:52,157] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default0]:[2022-10-07 09:01:52,162] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:52,191] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default0]:[2022-10-07 09:01:52,191] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:52,219] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default0]:[2022-10-07 09:01:52,224] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default3]:[2022-10-07 09:01:52,231] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default3]:[2022-10-07 09:01:52,232] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_01-model_states.pt... +[default3]:[2022-10-07 09:01:52,151] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_01_model_states.pt. +[default3]:[2022-10-07 09:01:52,152] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default1]:[2022-10-07 09:01:52,152] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_01_model_states.pt. +[default1]:[2022-10-07 09:01:52,152] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default7]:[2022-10-07 09:01:52,166] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_01_model_states.pt. +[default7]:[2022-10-07 09:01:52,166] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default1]:[2022-10-07 09:01:52,231] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default1]:[2022-10-07 09:01:52,232] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_01-model_states.pt... +[default5]:[2022-10-07 09:01:52,171] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_01_model_states.pt. +[default5]:[2022-10-07 09:01:52,172] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:52,180] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default0]:[2022-10-07 09:01:52,180] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:52,208] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default0]:[2022-10-07 09:01:52,214] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:52,214] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default0]:[2022-10-07 09:01:52,215] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:52,215] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:52,178] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:52,178] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default2]:[2022-10-07 09:01:52,198] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:52,203] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default2]:[2022-10-07 09:01:52,207] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:52,207] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default2]:[2022-10-07 09:01:52,207] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:52,180] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:52,181] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default6]:[2022-10-07 09:01:52,210] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:52,217] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default6]:[2022-10-07 09:01:52,217] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:52,217] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default6]:[2022-10-07 09:01:52,217] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:52,205] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:52,212] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default5]:[2022-10-07 09:01:52,224] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_03_model_states.pt. +[default5]:[2022-10-07 09:01:52,225] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default3]:[2022-10-07 09:01:52,179] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_01-model_states.pt. +[default3]:[2022-10-07 09:01:52,187] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default3]:[2022-10-07 09:01:52,204] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default3]:[2022-10-07 09:01:52,205] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_01-model_states.pt... +[default3]:[2022-10-07 09:01:52,231] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_01-model_states.pt. +[default3]:[2022-10-07 09:01:52,237] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default4]:[2022-10-07 09:01:52,162] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:52,162] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default4]:[2022-10-07 09:01:52,184] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:52,191] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default7]:[2022-10-07 09:01:52,230] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_03_model_states.pt. +[default7]:[2022-10-07 09:01:52,231] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default3]:[2022-10-07 09:01:52,271] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default3]:[2022-10-07 09:01:52,277] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_01-model_states.pt... +[default0]:[2022-10-07 09:01:52,258] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default0]:[2022-10-07 09:01:52,264] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:52,293] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default0]:[2022-10-07 09:01:52,293] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:52,320] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default0]:[2022-10-07 09:01:52,326] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default4]:[2022-10-07 09:01:52,261] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:52,265] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default4]:[2022-10-07 09:01:52,317] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:52,317] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:52,245] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default0]:[2022-10-07 09:01:52,246] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:52,272] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default0]:[2022-10-07 09:01:52,278] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:52,321] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default0]:[2022-10-07 09:01:52,321] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default6]:[2022-10-07 09:01:52,255] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:52,260] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default6]:[2022-10-07 09:01:52,316] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:52,316] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:52,284] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default0]:[2022-10-07 09:01:52,285] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:52,320] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default0]:[2022-10-07 09:01:52,325] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default1]:[2022-10-07 09:01:52,256] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default1]:[2022-10-07 09:01:52,259] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_01-model_states.pt... +[default1]:[2022-10-07 09:01:52,336] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_01-model_states.pt. +[default5]:[2022-10-07 09:01:52,263] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_01-model_states.pt. +[default5]:[2022-10-07 09:01:52,268] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default5]:[2022-10-07 09:01:52,288] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default5]:[2022-10-07 09:01:52,288] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_01-model_states.pt... +[default7]:[2022-10-07 09:01:52,263] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_01-model_states.pt. +[default7]:[2022-10-07 09:01:52,270] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default7]:[2022-10-07 09:01:52,288] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default7]:[2022-10-07 09:01:52,288] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_01-model_states.pt... +[default3]:[2022-10-07 09:01:52,297] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_01-model_states.pt. +[default3]:[2022-10-07 09:01:52,304] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default3]:[2022-10-07 09:01:52,321] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default3]:[2022-10-07 09:01:52,321] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_01-model_states.pt... +[default1]:[2022-10-07 09:01:52,294] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_01-model_states.pt. +[default1]:[2022-10-07 09:01:52,300] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default1]:[2022-10-07 09:01:52,320] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default1]:[2022-10-07 09:01:52,321] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_01-model_states.pt... +[default6]:[2022-10-07 09:01:52,282] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:52,283] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default6]:[2022-10-07 09:01:52,307] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:52,287] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:52,288] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default1]:[2022-10-07 09:01:52,270] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_01-model_states.pt. +[default1]:[2022-10-07 09:01:52,279] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default3]:[2022-10-07 09:01:52,271] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_01-model_states.pt. +[default3]:[2022-10-07 09:01:52,282] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default2]:[2022-10-07 09:01:52,281] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:52,282] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default2]:[2022-10-07 09:01:52,307] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:52,312] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default2]:[2022-10-07 09:01:52,338] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:52,339] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:52,253] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default0]:[2022-10-07 09:01:52,253] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:52,281] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default0]:[2022-10-07 09:01:52,286] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default3]:[2022-10-07 09:01:52,293] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_01-model_states.pt. +[default3]:[2022-10-07 09:01:52,298] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default2]:[2022-10-07 09:01:52,291] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default2]:[2022-10-07 09:01:52,292] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:52,287] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default0]:[2022-10-07 09:01:52,288] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:52,291] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default0]:[2022-10-07 09:01:52,292] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default1]:[2022-10-07 09:01:52,290] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_01-model_states.pt. +[default1]:[2022-10-07 09:01:52,295] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default6]:[2022-10-07 09:01:52,318] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default6]:[2022-10-07 09:01:52,345] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:52,346] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default5]:[2022-10-07 09:01:52,250] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default5]:[2022-10-07 09:01:52,251] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_01-model_states.pt... +[default5]:[2022-10-07 09:01:52,275] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_01-model_states.pt. +[default5]:[2022-10-07 09:01:52,282] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default5]:[2022-10-07 09:01:52,307] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default5]:[2022-10-07 09:01:52,307] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_01-model_states.pt... +[default5]:[2022-10-07 09:01:52,334] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_01-model_states.pt. +[default5]:[2022-10-07 09:01:52,346] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default4]:[2022-10-07 09:01:52,280] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:52,281] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default4]:[2022-10-07 09:01:52,301] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:52,307] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default4]:[2022-10-07 09:01:52,337] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:52,338] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default3]:[2022-10-07 09:01:52,255] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default3]:[2022-10-07 09:01:52,259] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_01-model_states.pt... +[default3]:[2022-10-07 09:01:52,336] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_01-model_states.pt. +[default3]:[2022-10-07 09:01:52,345] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default7]:[2022-10-07 09:01:52,254] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default7]:[2022-10-07 09:01:52,255] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_01-model_states.pt... +[default7]:[2022-10-07 09:01:52,279] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_01-model_states.pt. +[default7]:[2022-10-07 09:01:52,288] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default7]:[2022-10-07 09:01:52,317] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default7]:[2022-10-07 09:01:52,317] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_01-model_states.pt... +[default7]:[2022-10-07 09:01:52,351] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_01-model_states.pt. +[default3]:[2022-10-07 09:01:52,393] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_01-model_states.pt. +[default3]:[2022-10-07 09:01:52,398] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:52,356] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default0]:[2022-10-07 09:01:52,357] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:52,384] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default0]:[2022-10-07 09:01:52,392] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default1]:[2022-10-07 09:01:52,419] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_03_model_states.pt. +[default1]:[2022-10-07 09:01:52,420] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default4]:[2022-10-07 09:01:52,345] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:52,350] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default4]:[2022-10-07 09:01:52,401] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:52,401] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:52,352] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default0]:[2022-10-07 09:01:52,357] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:52,410] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default0]:[2022-10-07 09:01:52,410] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default7]:[2022-10-07 09:01:52,414] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default7]:[2022-10-07 09:01:52,416] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_01-model_states.pt... +[default6]:[2022-10-07 09:01:52,338] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:52,343] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default6]:[2022-10-07 09:01:52,400] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:52,400] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:52,371] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default0]:[2022-10-07 09:01:52,371] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:52,394] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default0]:[2022-10-07 09:01:52,398] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:52,424] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default0]:[2022-10-07 09:01:52,424] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default1]:[2022-10-07 09:01:52,342] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default1]:[2022-10-07 09:01:52,359] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default1]:[2022-10-07 09:01:52,360] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_01-model_states.pt... +[default1]:[2022-10-07 09:01:52,425] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_01-model_states.pt. +[default1]:[2022-10-07 09:01:52,431] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default5]:[2022-10-07 09:01:52,368] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_01-model_states.pt. +[default5]:[2022-10-07 09:01:52,375] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default7]:[2022-10-07 09:01:52,368] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_01-model_states.pt. +[default7]:[2022-10-07 09:01:52,384] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default1]:[2022-10-07 09:01:52,373] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_01-model_states.pt. +[default3]:[2022-10-07 09:01:52,374] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_01-model_states.pt. +[default3]:[2022-10-07 09:01:52,383] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default1]:[2022-10-07 09:01:52,384] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default1]:[2022-10-07 09:01:52,408] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default3]:[2022-10-07 09:01:52,407] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default3]:[2022-10-07 09:01:52,409] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_01-model_states.pt... +[default1]:[2022-10-07 09:01:52,409] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_01-model_states.pt... +[default6]:[2022-10-07 09:01:52,372] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:52,381] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default6]:[2022-10-07 09:01:52,405] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:52,406] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default6]:[2022-10-07 09:01:52,435] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default5]:[2022-10-07 09:01:52,374] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default5]:[2022-10-07 09:01:52,375] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_01-model_states.pt... +[default5]:[2022-10-07 09:01:52,396] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_01-model_states.pt. +[default5]:[2022-10-07 09:01:52,408] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default5]:[2022-10-07 09:01:52,434] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default5]:[2022-10-07 09:01:52,435] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_01-model_states.pt... +[default1]:[2022-10-07 09:01:52,372] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default1]:[2022-10-07 09:01:52,373] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_01-model_states.pt... +[default3]:[2022-10-07 09:01:52,372] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default3]:[2022-10-07 09:01:52,373] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_01-model_states.pt... +[default2]:[2022-10-07 09:01:52,374] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:52,380] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default2]:[2022-10-07 09:01:52,413] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:52,414] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default2]:[2022-10-07 09:01:52,443] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:52,362] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:52,367] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default4]:[2022-10-07 09:01:52,389] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:52,389] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default4]:[2022-10-07 09:01:52,412] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:52,417] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default4]:[2022-10-07 09:01:52,442] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:52,442] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default3]:[2022-10-07 09:01:52,371] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default3]:[2022-10-07 09:01:52,371] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_01-model_states.pt... +[default3]:[2022-10-07 09:01:52,428] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_01-model_states.pt. +[default3]:[2022-10-07 09:01:52,437] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:52,374] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default0]:[2022-10-07 09:01:52,374] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:52,411] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default0]:[2022-10-07 09:01:52,416] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default3]:[2022-10-07 09:01:52,374] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default3]:[2022-10-07 09:01:52,374] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_01-model_states.pt... +[default3]:[2022-10-07 09:01:52,426] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_01-model_states.pt. +[default3]:[2022-10-07 09:01:52,431] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default3]:[2022-10-07 09:01:52,379] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default3]:[2022-10-07 09:01:52,380] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_01-model_states.pt... +[default2]:[2022-10-07 09:01:52,396] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:52,397] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:52,393] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default1]:[2022-10-07 09:01:52,371] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default1]:[2022-10-07 09:01:52,372] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_01-model_states.pt... +[default0]:[2022-10-07 09:01:52,394] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default7]:[2022-10-07 09:01:52,382] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default7]:[2022-10-07 09:01:52,383] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_01-model_states.pt... +[default7]:[2022-10-07 09:01:52,361] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default7]:[2022-10-07 09:01:52,391] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default7]:[2022-10-07 09:01:52,391] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_01-model_states.pt... +[default7]:[2022-10-07 09:01:52,427] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_01-model_states.pt. +[default7]:[2022-10-07 09:01:52,440] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default1]:[2022-10-07 09:01:52,371] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default1]:[2022-10-07 09:01:52,372] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_01-model_states.pt... +[default1]:[2022-10-07 09:01:52,421] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_01-model_states.pt. +[default1]:[2022-10-07 09:01:52,424] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default5]:[2022-10-07 09:01:52,385] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default5]:[2022-10-07 09:01:52,387] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_01-model_states.pt... +[default2]:[2022-10-07 09:01:52,498] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_02_model_states.pt. +[default2]:[2022-10-07 09:01:52,498] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default2]:[2022-10-07 09:01:52,531] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:52,531] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default3]:[2022-10-07 09:01:52,450] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default3]:[2022-10-07 09:01:52,452] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_01-model_states.pt... +[default3]:[2022-10-07 09:01:52,499] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_01-model_states.pt. +[default3]:[2022-10-07 09:01:52,505] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default1]:[2022-10-07 09:01:52,511] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_03_model_states.pt. +[default1]:[2022-10-07 09:01:52,511] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:52,456] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default0]:[2022-10-07 09:01:52,457] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:52,489] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default0]:[2022-10-07 09:01:52,496] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default1]:[2022-10-07 09:01:52,447] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default1]:[2022-10-07 09:01:52,447] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_01-model_states.pt... +[default1]:[2022-10-07 09:01:52,484] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_01-model_states.pt. +[default4]:[2022-10-07 09:01:52,467] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:52,473] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default4]:[2022-10-07 09:01:52,527] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default1]:[2022-10-07 09:01:52,490] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default1]:[2022-10-07 09:01:52,521] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default1]:[2022-10-07 09:01:52,521] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_01-model_states.pt... +[default4]:[2022-10-07 09:01:52,527] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:52,468] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default0]:[2022-10-07 09:01:52,477] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:52,529] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default0]:[2022-10-07 09:01:52,530] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default5]:[2022-10-07 09:01:52,488] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_01-model_states.pt. +[default5]:[2022-10-07 09:01:52,528] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default6]:[2022-10-07 09:01:52,463] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:52,470] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default6]:[2022-10-07 09:01:52,523] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:52,524] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:52,444] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default0]:[2022-10-07 09:01:52,449] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:52,472] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default0]:[2022-10-07 09:01:52,472] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:52,499] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default0]:[2022-10-07 09:01:52,503] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:52,533] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default0]:[2022-10-07 09:01:52,533] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default1]:[2022-10-07 09:01:52,449] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default1]:[2022-10-07 09:01:52,451] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_01-model_states.pt... +[default1]:[2022-10-07 09:01:52,486] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_01-model_states.pt. +[default1]:[2022-10-07 09:01:52,491] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default1]:[2022-10-07 09:01:52,508] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default1]:[2022-10-07 09:01:52,511] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_01-model_states.pt... +[default1]:[2022-10-07 09:01:52,445] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_01-model_states.pt. +[default3]:[2022-10-07 09:01:52,446] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_01-model_states.pt. +[default3]:[2022-10-07 09:01:52,456] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default1]:[2022-10-07 09:01:52,462] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default1]:[2022-10-07 09:01:52,480] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default1]:[2022-10-07 09:01:52,480] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_01-model_states.pt... +[default3]:[2022-10-07 09:01:52,476] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default3]:[2022-10-07 09:01:52,478] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_01-model_states.pt... +[default1]:[2022-10-07 09:01:52,511] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_01-model_states.pt. +[default1]:[2022-10-07 09:01:52,519] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default1]:[2022-10-07 09:01:52,538] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default1]:[2022-10-07 09:01:52,539] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_01-model_states.pt... +[default3]:[2022-10-07 09:01:52,511] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_01-model_states.pt. +[default3]:[2022-10-07 09:01:52,519] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default3]:[2022-10-07 09:01:52,540] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default3]:[2022-10-07 09:01:52,540] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_01-model_states.pt... +[default6]:[2022-10-07 09:01:52,447] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default6]:[2022-10-07 09:01:52,473] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:52,474] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default2]:[2022-10-07 09:01:52,523] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default1]:[2022-10-07 09:01:52,453] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_01-model_states.pt. +[default1]:[2022-10-07 09:01:52,464] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default1]:[2022-10-07 09:01:52,521] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default1]:[2022-10-07 09:01:52,522] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_01-model_states.pt... +[default3]:[2022-10-07 09:01:52,455] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_01-model_states.pt. +[default3]:[2022-10-07 09:01:52,469] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default3]:[2022-10-07 09:01:52,522] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default3]:[2022-10-07 09:01:52,522] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_01-model_states.pt... +[default2]:[2022-10-07 09:01:52,451] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default2]:[2022-10-07 09:01:52,485] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:52,486] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default2]:[2022-10-07 09:01:52,522] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:52,529] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:52,483] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default0]:[2022-10-07 09:01:52,484] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:52,519] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default0]:[2022-10-07 09:01:52,526] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default3]:[2022-10-07 09:01:52,491] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default3]:[2022-10-07 09:01:52,492] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_01-model_states.pt... +[default3]:[2022-10-07 09:01:52,536] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_01-model_states.pt. +[default3]:[2022-10-07 09:01:52,542] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default2]:[2022-10-07 09:01:52,495] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:52,541] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:52,523] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default0]:[2022-10-07 09:01:52,489] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default0]:[2022-10-07 09:01:52,533] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default1]:[2022-10-07 09:01:52,479] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default1]:[2022-10-07 09:01:52,480] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_01-model_states.pt... +[default1]:[2022-10-07 09:01:52,527] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_01-model_states.pt. +[default1]:[2022-10-07 09:01:52,531] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default5]:[2022-10-07 09:01:52,489] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default5]:[2022-10-07 09:01:52,490] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_01-model_states.pt... +[default5]:[2022-10-07 09:01:52,523] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_01-model_states.pt. +[default5]:[2022-10-07 09:01:52,531] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default7]:[2022-10-07 09:01:52,491] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default7]:[2022-10-07 09:01:52,492] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_01-model_states.pt... +[default7]:[2022-10-07 09:01:52,531] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_01-model_states.pt. +[default7]:[2022-10-07 09:01:52,539] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default6]:[2022-10-07 09:01:52,501] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:52,512] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default5]:[2022-10-07 09:01:52,463] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_01-model_states.pt. +[default5]:[2022-10-07 09:01:52,473] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default5]:[2022-10-07 09:01:52,500] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default5]:[2022-10-07 09:01:52,500] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_01-model_states.pt... +[default5]:[2022-10-07 09:01:52,528] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_01-model_states.pt. +[default5]:[2022-10-07 09:01:52,541] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default4]:[2022-10-07 09:01:52,463] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:52,473] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default4]:[2022-10-07 09:01:52,494] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:52,494] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default4]:[2022-10-07 09:01:52,516] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:52,524] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default4]:[2022-10-07 09:01:52,547] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:52,547] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default3]:[2022-10-07 09:01:52,461] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default3]:[2022-10-07 09:01:52,461] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_01-model_states.pt... +[default3]:[2022-10-07 09:01:52,488] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_01-model_states.pt. +[default3]:[2022-10-07 09:01:52,495] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default3]:[2022-10-07 09:01:52,519] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default3]:[2022-10-07 09:01:52,520] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_01-model_states.pt... +[default3]:[2022-10-07 09:01:52,545] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_01-model_states.pt. +[default7]:[2022-10-07 09:01:52,474] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default7]:[2022-10-07 09:01:52,475] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_01-model_states.pt... +[default7]:[2022-10-07 09:01:52,512] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_01-model_states.pt. +[default7]:[2022-10-07 09:01:52,523] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default3]:[2022-10-07 09:01:52,548] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default3]:[2022-10-07 09:01:52,550] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_01-model_states.pt... +[default3]:[2022-10-07 09:01:52,593] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_01-model_states.pt. +[default3]:[2022-10-07 09:01:52,600] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default2]:[2022-10-07 09:01:52,559] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:52,565] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default2]:[2022-10-07 09:01:52,595] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:52,595] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default2]:[2022-10-07 09:01:52,621] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:52,626] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default1]:[2022-10-07 09:01:52,538] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default1]:[2022-10-07 09:01:52,538] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_01-model_states.pt... +[default1]:[2022-10-07 09:01:52,557] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_01-model_states.pt. +[default1]:[2022-10-07 09:01:52,562] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default1]:[2022-10-07 09:01:52,587] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default1]:[2022-10-07 09:01:52,588] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_01-model_states.pt... +[default1]:[2022-10-07 09:01:52,604] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_01-model_states.pt. +[default1]:[2022-10-07 09:01:52,610] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:52,552] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default0]:[2022-10-07 09:01:52,552] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:52,584] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default0]:[2022-10-07 09:01:52,589] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default1]:[2022-10-07 09:01:52,617] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_01_model_states.pt. +[default1]:[2022-10-07 09:01:52,617] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default4]:[2022-10-07 09:01:52,556] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:52,560] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default4]:[2022-10-07 09:01:52,605] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:52,605] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:52,554] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default0]:[2022-10-07 09:01:52,559] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:52,606] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default0]:[2022-10-07 09:01:52,606] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default1]:[2022-10-07 09:01:52,547] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_01-model_states.pt. +[default1]:[2022-10-07 09:01:52,552] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default1]:[2022-10-07 09:01:52,576] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default1]:[2022-10-07 09:01:52,576] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_01-model_states.pt... +[default1]:[2022-10-07 09:01:52,609] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_01-model_states.pt. +[default1]:[2022-10-07 09:01:52,614] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default7]:[2022-10-07 09:01:52,576] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_01-model_states.pt. +[default5]:[2022-10-07 09:01:52,556] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default5]:[2022-10-07 09:01:52,557] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_01-model_states.pt... +[default5]:[2022-10-07 09:01:52,607] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_01-model_states.pt. +[default5]:[2022-10-07 09:01:52,613] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default6]:[2022-10-07 09:01:52,546] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:52,550] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default6]:[2022-10-07 09:01:52,600] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:52,600] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:52,557] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default0]:[2022-10-07 09:01:52,561] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:52,586] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default0]:[2022-10-07 09:01:52,586] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:52,616] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default0]:[2022-10-07 09:01:52,620] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default3]:[2022-10-07 09:01:52,603] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_03_model_states.pt. +[default3]:[2022-10-07 09:01:52,604] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default1]:[2022-10-07 09:01:52,540] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_01-model_states.pt. +[default1]:[2022-10-07 09:01:52,546] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default1]:[2022-10-07 09:01:52,563] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default1]:[2022-10-07 09:01:52,564] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_01-model_states.pt... +[default1]:[2022-10-07 09:01:52,597] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_01-model_states.pt. +[default1]:[2022-10-07 09:01:52,603] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default1]:[2022-10-07 09:01:52,604] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default1]:[2022-10-07 09:01:52,604] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_01-model_states.pt... +[default1]:[2022-10-07 09:01:52,607] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_01-model_states.pt. +[default5]:[2022-10-07 09:01:52,553] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default5]:[2022-10-07 09:01:52,553] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_01-model_states.pt... +[default5]:[2022-10-07 09:01:52,589] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_01-model_states.pt. +[default5]:[2022-10-07 09:01:52,600] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default5]:[2022-10-07 09:01:52,625] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default5]:[2022-10-07 09:01:52,628] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_01-model_states.pt... +[default7]:[2022-10-07 09:01:52,560] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default7]:[2022-10-07 09:01:52,560] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_01-model_states.pt... +[default7]:[2022-10-07 09:01:52,590] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_01-model_states.pt. +[default7]:[2022-10-07 09:01:52,602] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default7]:[2022-10-07 09:01:52,626] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default7]:[2022-10-07 09:01:52,628] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_01-model_states.pt... +[default1]:[2022-10-07 09:01:52,573] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_01-model_states.pt. +[default3]:[2022-10-07 09:01:52,573] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_01-model_states.pt. +[default1]:[2022-10-07 09:01:52,595] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default1]:[2022-10-07 09:01:52,596] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default1]:[2022-10-07 09:01:52,596] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_01-model_states.pt... +[default3]:[2022-10-07 09:01:52,591] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default3]:[2022-10-07 09:01:52,591] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default1]:[2022-10-07 09:01:52,596] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_01-model_states.pt. +[default3]:[2022-10-07 09:01:52,592] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_01-model_states.pt... +[default3]:[2022-10-07 09:01:52,596] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_01-model_states.pt. +[default5]:[2022-10-07 09:01:52,595] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_03_model_states.pt. +[default5]:[2022-10-07 09:01:52,596] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default5]:[2022-10-07 09:01:52,617] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default5]:[2022-10-07 09:01:52,617] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_01-model_states.pt... +[default5]:[2022-10-07 09:01:52,637] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_01-model_states.pt. +[default5]:[2022-10-07 09:01:52,643] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default6]:[2022-10-07 09:01:52,547] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:52,547] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default6]:[2022-10-07 09:01:52,576] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:52,587] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default6]:[2022-10-07 09:01:52,615] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:52,615] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default6]:[2022-10-07 09:01:52,636] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:52,645] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default2]:[2022-10-07 09:01:52,626] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default5]:[2022-10-07 09:01:52,568] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default5]:[2022-10-07 09:01:52,568] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_01-model_states.pt... +[default5]:[2022-10-07 09:01:52,596] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_01-model_states.pt. +[default5]:[2022-10-07 09:01:52,607] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default5]:[2022-10-07 09:01:52,631] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default5]:[2022-10-07 09:01:52,631] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_01-model_states.pt... +[default6]:[2022-10-07 09:01:52,594] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default6]:[2022-10-07 09:01:52,595] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default1]:[2022-10-07 09:01:52,604] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_01-model_states.pt. +[default1]:[2022-10-07 09:01:52,618] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default3]:[2022-10-07 09:01:52,604] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_01-model_states.pt. +[default3]:[2022-10-07 09:01:52,622] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default5]:[2022-10-07 09:01:52,584] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_01_model_states.pt. +[default5]:[2022-10-07 09:01:52,584] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default2]:[2022-10-07 09:01:52,610] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:52,610] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default2]:[2022-10-07 09:01:52,644] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:52,569] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:52,576] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default4]:[2022-10-07 09:01:52,603] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:52,603] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default4]:[2022-10-07 09:01:52,623] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:52,628] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default4]:[2022-10-07 09:01:52,649] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:52,649] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default3]:[2022-10-07 09:01:52,553] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default3]:[2022-10-07 09:01:52,575] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default3]:[2022-10-07 09:01:52,576] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_01-model_states.pt... +[default3]:[2022-10-07 09:01:52,604] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_01-model_states.pt. +[default3]:[2022-10-07 09:01:52,611] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default3]:[2022-10-07 09:01:52,611] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default3]:[2022-10-07 09:01:52,611] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_01-model_states.pt... +[default3]:[2022-10-07 09:01:52,612] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_01-model_states.pt. +[default0]:[2022-10-07 09:01:52,611] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default0]:[2022-10-07 09:01:52,611] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:52,644] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default0]:[2022-10-07 09:01:52,649] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default3]:[2022-10-07 09:01:52,618] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default3]:[2022-10-07 09:01:52,619] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_01-model_states.pt... +[default3]:[2022-10-07 09:01:52,553] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_01-model_states.pt. +[default3]:[2022-10-07 09:01:52,624] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default2]:[2022-10-07 09:01:52,590] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:52,590] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default2]:[2022-10-07 09:01:52,612] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:52,624] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:52,589] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:52,630] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default0]:[2022-10-07 09:01:52,631] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default1]:[2022-10-07 09:01:52,552] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_01-model_states.pt. +[default0]:[2022-10-07 09:01:52,585] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default0]:[2022-10-07 09:01:52,585] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:52,604] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default0]:[2022-10-07 09:01:52,613] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default1]:[2022-10-07 09:01:52,609] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default1]:[2022-10-07 09:01:52,629] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default0]:[2022-10-07 09:01:52,650] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default0]:[2022-10-07 09:01:52,650] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default1]:[2022-10-07 09:01:52,630] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_01-model_states.pt... +[default7]:[2022-10-07 09:01:52,570] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_01-model_states.pt. +[default7]:[2022-10-07 09:01:52,623] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default7]:[2022-10-07 09:01:52,559] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default7]:[2022-10-07 09:01:52,559] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_01-model_states.pt... +[default7]:[2022-10-07 09:01:52,592] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_01-model_states.pt. +[default7]:[2022-10-07 09:01:52,603] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default7]:[2022-10-07 09:01:52,631] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default7]:[2022-10-07 09:01:52,631] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_01-model_states.pt... +[default1]:[2022-10-07 09:01:52,609] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default1]:[2022-10-07 09:01:52,610] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_01-model_states.pt... +[default5]:[2022-10-07 09:01:52,591] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_01-model_states.pt. +[default5]:[2022-10-07 09:01:52,634] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default3]:[2022-10-07 09:01:52,642] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default3]:[2022-10-07 09:01:52,644] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_01-model_states.pt... +[default3]:[2022-10-07 09:01:52,722] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_01-model_states.pt. +[default3]:[2022-10-07 09:01:52,728] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default3]:[2022-10-07 09:01:52,729] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default3]:[2022-10-07 09:01:52,730] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_01-model_states.pt... +[default3]:[2022-10-07 09:01:52,732] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_01-model_states.pt. +[default2]:[2022-10-07 09:01:52,652] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:52,652] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default2]:[2022-10-07 09:01:52,671] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:52,676] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default2]:[2022-10-07 09:01:52,708] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:52,708] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default2]:[2022-10-07 09:01:52,731] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default1]:[2022-10-07 09:01:52,647] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default1]:[2022-10-07 09:01:52,648] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_01-model_states.pt... +[default1]:[2022-10-07 09:01:52,666] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_01-model_states.pt. +[default1]:[2022-10-07 09:01:52,671] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default1]:[2022-10-07 09:01:52,695] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default1]:[2022-10-07 09:01:52,695] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_01-model_states.pt... +[default1]:[2022-10-07 09:01:52,715] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_01-model_states.pt. +[default1]:[2022-10-07 09:01:52,720] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:52,640] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default0]:[2022-10-07 09:01:52,640] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:52,691] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default0]:[2022-10-07 09:01:52,696] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:52,698] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default0]:[2022-10-07 09:01:52,698] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:52,698] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:52,643] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:52,651] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default4]:[2022-10-07 09:01:52,694] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default1]:[2022-10-07 09:01:52,641] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default1]:[2022-10-07 09:01:52,641] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_01-model_states.pt... +[default1]:[2022-10-07 09:01:52,663] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_01-model_states.pt. +[default4]:[2022-10-07 09:01:52,694] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default4]:[2022-10-07 09:01:52,724] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:52,729] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:52,649] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default0]:[2022-10-07 09:01:52,654] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:52,694] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default0]:[2022-10-07 09:01:52,695] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:52,725] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default0]:[2022-10-07 09:01:52,730] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default1]:[2022-10-07 09:01:52,668] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default1]:[2022-10-07 09:01:52,698] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default1]:[2022-10-07 09:01:52,698] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_01-model_states.pt... +[default0]:[2022-10-07 09:01:52,647] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default0]:[2022-10-07 09:01:52,647] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:52,671] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default0]:[2022-10-07 09:01:52,677] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:52,698] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default0]:[2022-10-07 09:01:52,698] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:52,724] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default0]:[2022-10-07 09:01:52,728] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default3]:[2022-10-07 09:01:52,652] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default3]:[2022-10-07 09:01:52,652] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_01-model_states.pt... +[default3]:[2022-10-07 09:01:52,687] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_01-model_states.pt. +[default3]:[2022-10-07 09:01:52,693] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default3]:[2022-10-07 09:01:52,727] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default3]:[2022-10-07 09:01:52,728] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_01-model_states.pt... +[default5]:[2022-10-07 09:01:52,665] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_01-model_states.pt. +[default5]:[2022-10-07 09:01:52,679] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default5]:[2022-10-07 09:01:52,705] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default5]:[2022-10-07 09:01:52,705] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_01-model_states.pt... +[default7]:[2022-10-07 09:01:52,665] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_01-model_states.pt. +[default7]:[2022-10-07 09:01:52,679] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default7]:[2022-10-07 09:01:52,706] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default7]:[2022-10-07 09:01:52,707] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_01-model_states.pt... +[default5]:[2022-10-07 09:01:52,679] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default5]:[2022-10-07 09:01:52,679] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_01-model_states.pt... +[default5]:[2022-10-07 09:01:52,702] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_01-model_states.pt. +[default5]:[2022-10-07 09:01:52,707] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default5]:[2022-10-07 09:01:52,731] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default5]:[2022-10-07 09:01:52,731] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_01-model_states.pt... +[default6]:[2022-10-07 09:01:52,668] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:52,668] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default6]:[2022-10-07 09:01:52,690] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:52,698] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default6]:[2022-10-07 09:01:52,722] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:52,723] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default6]:[2022-10-07 09:01:52,747] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:52,659] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:52,659] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default2]:[2022-10-07 09:01:52,687] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:52,709] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default2]:[2022-10-07 09:01:52,745] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:52,745] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default5]:[2022-10-07 09:01:52,654] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_01-model_states.pt. +[default5]:[2022-10-07 09:01:52,661] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default5]:[2022-10-07 09:01:52,685] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default5]:[2022-10-07 09:01:52,685] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_01-model_states.pt... +[default5]:[2022-10-07 09:01:52,707] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_01-model_states.pt. +[default5]:[2022-10-07 09:01:52,714] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default5]:[2022-10-07 09:01:52,738] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default5]:[2022-10-07 09:01:52,738] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_01-model_states.pt... +[default6]:[2022-10-07 09:01:52,678] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default6]:[2022-10-07 09:01:52,678] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default1]:[2022-10-07 09:01:52,666] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default1]:[2022-10-07 09:01:52,669] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_01-model_states.pt... +[default1]:[2022-10-07 09:01:52,715] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_01-model_states.pt. +[default1]:[2022-10-07 09:01:52,725] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default3]:[2022-10-07 09:01:52,668] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default3]:[2022-10-07 09:01:52,669] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_01-model_states.pt... +[default3]:[2022-10-07 09:01:52,717] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_01-model_states.pt. +[default3]:[2022-10-07 09:01:52,731] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default2]:[2022-10-07 09:01:52,650] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default2]:[2022-10-07 09:01:52,700] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:52,700] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default2]:[2022-10-07 09:01:52,731] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:52,739] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default4]:[2022-10-07 09:01:52,670] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:52,675] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default4]:[2022-10-07 09:01:52,695] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:52,695] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default4]:[2022-10-07 09:01:52,718] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:52,723] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default4]:[2022-10-07 09:01:52,748] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:52,749] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:52,700] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default0]:[2022-10-07 09:01:52,700] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:52,735] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default0]:[2022-10-07 09:01:52,740] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default3]:[2022-10-07 09:01:52,663] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_01-model_states.pt. +[default3]:[2022-10-07 09:01:52,680] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default3]:[2022-10-07 09:01:52,736] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default3]:[2022-10-07 09:01:52,736] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_01-model_states.pt... +[default3]:[2022-10-07 09:01:52,658] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default3]:[2022-10-07 09:01:52,659] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_01-model_states.pt... +[default3]:[2022-10-07 09:01:52,683] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_01-model_states.pt. +[default3]:[2022-10-07 09:01:52,692] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default3]:[2022-10-07 09:01:52,721] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default3]:[2022-10-07 09:01:52,721] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_01-model_states.pt... +[default2]:[2022-10-07 09:01:52,654] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:52,655] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default2]:[2022-10-07 09:01:52,685] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:52,692] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default2]:[2022-10-07 09:01:52,733] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:52,733] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:52,667] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default0]:[2022-10-07 09:01:52,675] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:52,733] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default0]:[2022-10-07 09:01:52,733] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:52,669] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default1]:[2022-10-07 09:01:52,670] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_01-model_states.pt. +[default1]:[2022-10-07 09:01:52,677] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:52,674] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:52,726] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default0]:[2022-10-07 09:01:52,727] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:52,745] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default0]:[2022-10-07 09:01:52,750] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default1]:[2022-10-07 09:01:52,703] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default1]:[2022-10-07 09:01:52,705] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_01-model_states.pt... +[default1]:[2022-10-07 09:01:52,749] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_01-model_states.pt. +[default7]:[2022-10-07 09:01:52,672] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default7]:[2022-10-07 09:01:52,672] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_01-model_states.pt... +[default7]:[2022-10-07 09:01:52,706] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_01-model_states.pt. +[default7]:[2022-10-07 09:01:52,660] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_01-model_states.pt. +[default7]:[2022-10-07 09:01:52,668] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default7]:[2022-10-07 09:01:52,689] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default7]:[2022-10-07 09:01:52,711] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default7]:[2022-10-07 09:01:52,745] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default7]:[2022-10-07 09:01:52,746] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_01-model_states.pt... +[default7]:[2022-10-07 09:01:52,690] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_01-model_states.pt... +[default7]:[2022-10-07 09:01:52,718] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_01-model_states.pt. +[default7]:[2022-10-07 09:01:52,726] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default1]:[2022-10-07 09:01:52,662] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_01-model_states.pt. +[default1]:[2022-10-07 09:01:52,666] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default1]:[2022-10-07 09:01:52,693] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default1]:[2022-10-07 09:01:52,693] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_01-model_states.pt... +[default1]:[2022-10-07 09:01:52,740] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_01-model_states.pt. +[default1]:[2022-10-07 09:01:52,744] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default5]:[2022-10-07 09:01:52,679] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default5]:[2022-10-07 09:01:52,679] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_01-model_states.pt... +[default5]:[2022-10-07 09:01:52,718] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_01-model_states.pt. +[default5]:[2022-10-07 09:01:52,723] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default7]:[2022-10-07 09:01:52,655] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default7]:[2022-10-07 09:01:52,687] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default7]:[2022-10-07 09:01:52,687] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_01-model_states.pt... +[default7]:[2022-10-07 09:01:52,718] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_01-model_states.pt. +[default7]:[2022-10-07 09:01:52,726] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default5]:[2022-10-07 09:01:52,641] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default5]:[2022-10-07 09:01:52,642] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_01-model_states.pt... +[default5]:[2022-10-07 09:01:52,712] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_01-model_states.pt. +[default5]:[2022-10-07 09:01:52,717] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default6]:[2022-10-07 09:01:52,641] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:52,653] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default6]:[2022-10-07 09:01:52,690] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:52,690] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default6]:[2022-10-07 09:01:52,715] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:52,720] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default2]:[2022-10-07 09:01:52,736] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default2]:[2022-10-07 09:01:52,758] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:52,758] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default2]:[2022-10-07 09:01:52,783] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:52,787] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default2]:[2022-10-07 09:01:52,808] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:52,809] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default1]:[2022-10-07 09:01:52,745] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default1]:[2022-10-07 09:01:52,745] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_01-model_states.pt... +[default1]:[2022-10-07 09:01:52,771] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_01-model_states.pt. +[default1]:[2022-10-07 09:01:52,778] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default1]:[2022-10-07 09:01:52,801] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default1]:[2022-10-07 09:01:52,801] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_01-model_states.pt... +[default1]:[2022-10-07 09:01:52,821] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_01-model_states.pt. +[default1]:[2022-10-07 09:01:52,826] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default1]:[2022-10-07 09:01:52,815] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default1]:[2022-10-07 09:01:52,816] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_01-model_states.pt... +[default4]:[2022-10-07 09:01:52,767] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:52,768] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default4]:[2022-10-07 09:01:52,797] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default0]:[2022-10-07 09:01:52,771] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default0]:[2022-10-07 09:01:52,771] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:52,801] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default1]:[2022-10-07 09:01:52,737] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_01-model_states.pt. +[default1]:[2022-10-07 09:01:52,743] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default1]:[2022-10-07 09:01:52,767] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default1]:[2022-10-07 09:01:52,767] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_01-model_states.pt... +[default1]:[2022-10-07 09:01:52,793] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_01-model_states.pt. +[default1]:[2022-10-07 09:01:52,798] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default1]:[2022-10-07 09:01:52,817] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default1]:[2022-10-07 09:01:52,817] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_01-model_states.pt... +[default0]:[2022-10-07 09:01:52,764] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default0]:[2022-10-07 09:01:52,764] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:52,785] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default0]:[2022-10-07 09:01:52,790] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:52,815] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default0]:[2022-10-07 09:01:52,815] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default3]:[2022-10-07 09:01:52,758] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_01-model_states.pt. +[default3]:[2022-10-07 09:01:52,763] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default3]:[2022-10-07 09:01:52,787] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default3]:[2022-10-07 09:01:52,787] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_01-model_states.pt... +[default3]:[2022-10-07 09:01:52,821] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_01-model_states.pt. +[default3]:[2022-10-07 09:01:52,827] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default5]:[2022-10-07 09:01:52,742] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_01-model_states.pt. +[default5]:[2022-10-07 09:01:52,756] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default5]:[2022-10-07 09:01:52,757] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default5]:[2022-10-07 09:01:52,757] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_01-model_states.pt... +[default5]:[2022-10-07 09:01:52,758] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_01-model_states.pt. +[default7]:[2022-10-07 09:01:52,743] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_01-model_states.pt. +[default7]:[2022-10-07 09:01:52,756] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default7]:[2022-10-07 09:01:52,757] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default7]:[2022-10-07 09:01:52,757] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_01-model_states.pt... +[default7]:[2022-10-07 09:01:52,758] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_01-model_states.pt. +[default7]:[2022-10-07 09:01:52,773] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_03_model_states.pt. +[default7]:[2022-10-07 09:01:52,774] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default7]:[2022-10-07 09:01:52,799] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default7]:[2022-10-07 09:01:52,799] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_01-model_states.pt... +[default7]:[2022-10-07 09:01:52,816] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_01-model_states.pt. +[default7]:[2022-10-07 09:01:52,821] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default5]:[2022-10-07 09:01:52,755] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_01-model_states.pt. +[default5]:[2022-10-07 09:01:52,760] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default5]:[2022-10-07 09:01:52,785] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default5]:[2022-10-07 09:01:52,785] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_01-model_states.pt... +[default5]:[2022-10-07 09:01:52,806] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_01-model_states.pt. +[default5]:[2022-10-07 09:01:52,810] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default5]:[2022-10-07 09:01:52,828] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default5]:[2022-10-07 09:01:52,828] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_01-model_states.pt... +[default6]:[2022-10-07 09:01:52,756] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default6]:[2022-10-07 09:01:52,780] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:52,780] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default2]:[2022-10-07 09:01:52,773] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:52,781] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default2]:[2022-10-07 09:01:52,812] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:52,812] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default6]:[2022-10-07 09:01:52,763] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:52,765] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default6]:[2022-10-07 09:01:52,837] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:52,838] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default1]:[2022-10-07 09:01:52,774] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default1]:[2022-10-07 09:01:52,775] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_01-model_states.pt... +[default1]:[2022-10-07 09:01:52,823] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_01-model_states.pt. +[default1]:[2022-10-07 09:01:52,837] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default5]:[2022-10-07 09:01:52,777] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default5]:[2022-10-07 09:01:52,778] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_01-model_states.pt... +[default3]:[2022-10-07 09:01:52,775] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default3]:[2022-10-07 09:01:52,775] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_01-model_states.pt... +[default3]:[2022-10-07 09:01:52,827] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_01-model_states.pt. +[default3]:[2022-10-07 09:01:52,844] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default2]:[2022-10-07 09:01:52,778] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:52,778] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default2]:[2022-10-07 09:01:52,814] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:52,821] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:52,778] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default3]:[2022-10-07 09:01:52,774] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_01-model_states.pt. +[default0]:[2022-10-07 09:01:52,778] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:52,814] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default0]:[2022-10-07 09:01:52,822] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default3]:[2022-10-07 09:01:52,786] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default3]:[2022-10-07 09:01:52,829] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default3]:[2022-10-07 09:01:52,829] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_01-model_states.pt... +[default3]:[2022-10-07 09:01:52,753] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_01-model_states.pt. +[default3]:[2022-10-07 09:01:52,760] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default3]:[2022-10-07 09:01:52,787] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default3]:[2022-10-07 09:01:52,787] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_01-model_states.pt... +[default3]:[2022-10-07 09:01:52,822] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_01-model_states.pt. +[default3]:[2022-10-07 09:01:52,830] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default2]:[2022-10-07 09:01:52,763] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:52,768] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default2]:[2022-10-07 09:01:52,808] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:52,808] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default2]:[2022-10-07 09:01:52,835] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:52,840] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:52,763] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default0]:[2022-10-07 09:01:52,772] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:52,800] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default0]:[2022-10-07 09:01:52,800] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:52,848] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default0]:[2022-10-07 09:01:52,805] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default1]:[2022-10-07 09:01:52,755] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:52,805] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:52,825] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default0]:[2022-10-07 09:01:52,831] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default1]:[2022-10-07 09:01:52,779] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default1]:[2022-10-07 09:01:52,780] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_01-model_states.pt... +[default1]:[2022-10-07 09:01:52,822] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_01-model_states.pt. +[default1]:[2022-10-07 09:01:52,828] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default1]:[2022-10-07 09:01:52,852] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default1]:[2022-10-07 09:01:52,853] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_01-model_states.pt... +[default7]:[2022-10-07 09:01:52,777] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_01-model_states.pt. +[default7]:[2022-10-07 09:01:52,781] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default7]:[2022-10-07 09:01:52,810] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default7]:[2022-10-07 09:01:52,810] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_01-model_states.pt... +[default7]:[2022-10-07 09:01:52,840] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_01-model_states.pt. +[default7]:[2022-10-07 09:01:52,845] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default5]:[2022-10-07 09:01:52,769] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default5]:[2022-10-07 09:01:52,769] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_01-model_states.pt... +[default5]:[2022-10-07 09:01:52,806] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_01-model_states.pt. +[default5]:[2022-10-07 09:01:52,811] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default7]:[2022-10-07 09:01:52,751] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default7]:[2022-10-07 09:01:52,752] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_01-model_states.pt... +[default7]:[2022-10-07 09:01:52,791] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_01-model_states.pt. +[default7]:[2022-10-07 09:01:52,799] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default7]:[2022-10-07 09:01:52,824] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default7]:[2022-10-07 09:01:52,825] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_01-model_states.pt... +[default5]:[2022-10-07 09:01:52,745] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default5]:[2022-10-07 09:01:52,745] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_01-model_states.pt... +[default5]:[2022-10-07 09:01:52,791] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_01-model_states.pt. +[default5]:[2022-10-07 09:01:52,796] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default5]:[2022-10-07 09:01:52,833] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default5]:[2022-10-07 09:01:52,834] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_01-model_states.pt... +[default6]:[2022-10-07 09:01:52,767] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:52,767] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default6]:[2022-10-07 09:01:52,789] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:52,799] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:52,805] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default6]:[2022-10-07 09:01:52,826] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:52,827] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default5]:[2022-10-07 09:01:52,762] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_01-model_states.pt. +[default5]:[2022-10-07 09:01:52,770] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default5]:[2022-10-07 09:01:52,790] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default5]:[2022-10-07 09:01:52,790] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_01-model_states.pt... +[default5]:[2022-10-07 09:01:52,809] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_01-model_states.pt. +[default5]:[2022-10-07 09:01:52,816] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default5]:[2022-10-07 09:01:52,836] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default5]:[2022-10-07 09:01:52,836] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_01-model_states.pt... +[default4]:[2022-10-07 09:01:52,772] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:52,777] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default4]:[2022-10-07 09:01:52,777] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:52,778] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default4]:[2022-10-07 09:01:52,778] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default7]:[2022-10-07 09:01:52,755] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default7]:[2022-10-07 09:01:52,756] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_01-model_states.pt... +[default7]:[2022-10-07 09:01:52,782] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_01-model_states.pt. +[default7]:[2022-10-07 09:01:52,789] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default7]:[2022-10-07 09:01:52,814] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default7]:[2022-10-07 09:01:52,814] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_01-model_states.pt... +[default7]:[2022-10-07 09:01:52,836] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_01-model_states.pt. +[default7]:[2022-10-07 09:01:52,842] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default7]:[2022-10-07 09:01:52,785] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_01_model_states.pt. +[default7]:[2022-10-07 09:01:52,785] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default1]:[2022-10-07 09:01:52,772] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default1]:[2022-10-07 09:01:52,773] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_01-model_states.pt... +[default1]:[2022-10-07 09:01:52,814] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_01-model_states.pt. +[default1]:[2022-10-07 09:01:52,820] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default2]:[2022-10-07 09:01:52,838] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:52,842] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default2]:[2022-10-07 09:01:52,862] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:52,862] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default2]:[2022-10-07 09:01:52,883] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:52,889] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default2]:[2022-10-07 09:01:52,911] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:52,911] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default2]:[2022-10-07 09:01:52,932] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default1]:[2022-10-07 09:01:52,842] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default1]:[2022-10-07 09:01:52,843] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_01-model_states.pt... +[default1]:[2022-10-07 09:01:52,859] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_01-model_states.pt. +[default1]:[2022-10-07 09:01:52,863] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default1]:[2022-10-07 09:01:52,881] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default1]:[2022-10-07 09:01:52,882] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_01-model_states.pt... +[default1]:[2022-10-07 09:01:52,908] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_01-model_states.pt. +[default1]:[2022-10-07 09:01:52,914] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default3]:[2022-10-07 09:01:52,899] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_01_model_states.pt. +[default3]:[2022-10-07 09:01:52,900] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default4]:[2022-10-07 09:01:52,871] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_00_optim_states.pt'] +[default4]:[2022-10-07 09:01:52,871] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt'] +[default1]:[2022-10-07 09:01:52,844] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_01-model_states.pt. +[default1]:[2022-10-07 09:01:52,851] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default1]:[2022-10-07 09:01:52,875] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default1]:[2022-10-07 09:01:52,875] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_01-model_states.pt... +[default1]:[2022-10-07 09:01:52,909] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_01-model_states.pt. +[default1]:[2022-10-07 09:01:52,914] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:52,840] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default0]:[2022-10-07 09:01:52,847] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:52,872] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default0]:[2022-10-07 09:01:52,872] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:52,893] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default0]:[2022-10-07 09:01:52,897] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:52,936] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default0]:[2022-10-07 09:01:52,936] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default3]:[2022-10-07 09:01:52,848] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default3]:[2022-10-07 09:01:52,848] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_01-model_states.pt... +[default3]:[2022-10-07 09:01:52,872] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_01-model_states.pt. +[default3]:[2022-10-07 09:01:52,876] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default3]:[2022-10-07 09:01:52,909] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default3]:[2022-10-07 09:01:52,909] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_01-model_states.pt... +[default4]:[2022-10-07 09:01:52,871] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_02_optim_states.pt'] +[default4]:[2022-10-07 09:01:52,871] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_02_optim_states.pt'] +[default7]:[2022-10-07 09:01:52,851] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default7]:[2022-10-07 09:01:52,851] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_01-model_states.pt... +[default7]:[2022-10-07 09:01:52,878] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_01-model_states.pt. +[default7]:[2022-10-07 09:01:52,882] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default7]:[2022-10-07 09:01:52,906] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default7]:[2022-10-07 09:01:52,906] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_01-model_states.pt... +[default7]:[2022-10-07 09:01:52,924] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_01-model_states.pt. +[default7]:[2022-10-07 09:01:52,928] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default5]:[2022-10-07 09:01:52,849] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_01-model_states.pt. +[default5]:[2022-10-07 09:01:52,853] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default5]:[2022-10-07 09:01:52,878] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default5]:[2022-10-07 09:01:52,878] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_01-model_states.pt... +[default5]:[2022-10-07 09:01:52,902] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_01-model_states.pt. +[default5]:[2022-10-07 09:01:52,906] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default5]:[2022-10-07 09:01:52,923] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default5]:[2022-10-07 09:01:52,923] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_01-model_states.pt... +[default5]:[2022-10-07 09:01:52,942] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_01-model_states.pt. +[default6]:[2022-10-07 09:01:52,848] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:52,854] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default6]:[2022-10-07 09:01:52,855] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:52,855] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default6]:[2022-10-07 09:01:52,855] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:52,854] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:52,867] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default2]:[2022-10-07 09:01:52,904] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:52,905] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default2]:[2022-10-07 09:01:52,933] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:52,942] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default6]:[2022-10-07 09:01:52,937] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default5]:[2022-10-07 09:01:52,856] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_01-model_states.pt. +[default5]:[2022-10-07 09:01:52,861] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default5]:[2022-10-07 09:01:52,881] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default5]:[2022-10-07 09:01:52,881] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_01-model_states.pt... +[default5]:[2022-10-07 09:01:52,901] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_01-model_states.pt. +[default5]:[2022-10-07 09:01:52,905] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default5]:[2022-10-07 09:01:52,906] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default5]:[2022-10-07 09:01:52,906] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_01-model_states.pt... +[default5]:[2022-10-07 09:01:52,906] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_01-model_states.pt. +[default1]:[2022-10-07 09:01:52,922] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default1]:[2022-10-07 09:01:52,929] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_01-model_states.pt... +[default3]:[2022-10-07 09:01:52,923] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default3]:[2022-10-07 09:01:52,929] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_01-model_states.pt... +[default2]:[2022-10-07 09:01:52,899] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:52,899] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default2]:[2022-10-07 09:01:52,928] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:52,935] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:52,900] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default3]:[2022-10-07 09:01:52,862] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_01-model_states.pt. +[default0]:[2022-10-07 09:01:52,900] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:52,936] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default3]:[2022-10-07 09:01:52,868] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default3]:[2022-10-07 09:01:52,918] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default0]:[2022-10-07 09:01:52,941] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default3]:[2022-10-07 09:01:52,918] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_01-model_states.pt... +[default3]:[2022-10-07 09:01:52,860] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default3]:[2022-10-07 09:01:52,860] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_01-model_states.pt... +[default3]:[2022-10-07 09:01:52,910] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_01-model_states.pt. +[default3]:[2022-10-07 09:01:52,917] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default2]:[2022-10-07 09:01:52,866] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:52,866] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default2]:[2022-10-07 09:01:52,892] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:52,897] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default2]:[2022-10-07 09:01:52,946] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:52,946] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default4]:[2022-10-07 09:01:52,876] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default4]:[2022-10-07 09:01:52,877] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:52,856] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:52,900] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default0]:[2022-10-07 09:01:52,900] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:52,933] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default0]:[2022-10-07 09:01:52,939] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:52,866] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default1]:[2022-10-07 09:01:52,910] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_01-model_states.pt. +[default1]:[2022-10-07 09:01:52,916] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default1]:[2022-10-07 09:01:52,940] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default0]:[2022-10-07 09:01:52,867] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:52,886] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default1]:[2022-10-07 09:01:52,942] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_01-model_states.pt... +[default0]:[2022-10-07 09:01:52,891] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:52,942] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default0]:[2022-10-07 09:01:52,943] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default7]:[2022-10-07 09:01:52,869] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default7]:[2022-10-07 09:01:52,869] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_01-model_states.pt... +[default7]:[2022-10-07 09:01:52,915] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_01-model_states.pt. +[default7]:[2022-10-07 09:01:52,919] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default7]:[2022-10-07 09:01:52,950] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default7]:[2022-10-07 09:01:52,950] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_01-model_states.pt... +[default7]:[2022-10-07 09:01:52,865] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default7]:[2022-10-07 09:01:52,865] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_01-model_states.pt... +[default7]:[2022-10-07 09:01:52,882] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_01-model_states.pt. +[default7]:[2022-10-07 09:01:52,888] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default7]:[2022-10-07 09:01:52,907] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default7]:[2022-10-07 09:01:52,907] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_01-model_states.pt... +[default7]:[2022-10-07 09:01:52,928] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_01-model_states.pt. +[default7]:[2022-10-07 09:01:52,933] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default7]:[2022-10-07 09:01:52,954] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default7]:[2022-10-07 09:01:52,954] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_01-model_states.pt... +[default5]:[2022-10-07 09:01:52,857] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default5]:[2022-10-07 09:01:52,857] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_01-model_states.pt... +[default5]:[2022-10-07 09:01:52,885] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_01-model_states.pt. +[default5]:[2022-10-07 09:01:52,889] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default5]:[2022-10-07 09:01:52,937] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default5]:[2022-10-07 09:01:52,937] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_01-model_states.pt... +[default6]:[2022-10-07 09:01:52,927] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default6]:[2022-10-07 09:01:52,927] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default5]:[2022-10-07 09:01:52,895] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_01-model_states.pt. +[default5]:[2022-10-07 09:01:52,899] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default5]:[2022-10-07 09:01:52,926] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default5]:[2022-10-07 09:01:52,926] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_01-model_states.pt... +[default7]:[2022-10-07 09:01:52,882] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_01-model_states.pt. +[default7]:[2022-10-07 09:01:52,890] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default7]:[2022-10-07 09:01:52,915] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default7]:[2022-10-07 09:01:52,916] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_01-model_states.pt... +[default6]:[2022-10-07 09:01:52,865] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_00_optim_states.pt'] +[default6]:[2022-10-07 09:01:52,865] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt'] +[default6]:[2022-10-07 09:01:52,865] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_02_optim_states.pt'] +[default6]:[2022-10-07 09:01:52,866] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_02_optim_states.pt'] +[default1]:[2022-10-07 09:01:52,895] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default1]:[2022-10-07 09:01:52,896] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_01-model_states.pt... +[default1]:[2022-10-07 09:01:52,947] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_01-model_states.pt. +[default1]:[2022-10-07 09:01:52,951] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default2]:[2022-10-07 09:01:52,938] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default2]:[2022-10-07 09:01:52,964] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:52,964] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default2]:[2022-10-07 09:01:52,982] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:52,986] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default2]:[2022-10-07 09:01:53,010] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:53,011] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default2]:[2022-10-07 09:01:53,034] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default1]:[2022-10-07 09:01:52,935] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default1]:[2022-10-07 09:01:52,935] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_01-model_states.pt... +[default1]:[2022-10-07 09:01:52,959] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_01-model_states.pt. +[default1]:[2022-10-07 09:01:52,964] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default1]:[2022-10-07 09:01:52,981] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default1]:[2022-10-07 09:01:52,982] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_01-model_states.pt... +[default1]:[2022-10-07 09:01:53,007] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_01-model_states.pt. +[default1]:[2022-10-07 09:01:53,013] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default1]:[2022-10-07 09:01:53,034] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default1]:[2022-10-07 09:01:53,034] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_01-model_states.pt... +[default2]:[2022-10-07 09:01:52,970] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default2]:[2022-10-07 09:01:52,971] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default3]:[2022-10-07 09:01:53,036] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default1]:[2022-10-07 09:01:52,987] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_01-model_states.pt. +[default1]:[2022-10-07 09:01:53,025] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:53,010] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_00_optim_states.pt'] +[default0]:[2022-10-07 09:01:53,011] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt'] +[default1]:[2022-10-07 09:01:52,942] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default1]:[2022-10-07 09:01:52,942] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_01-model_states.pt... +[default1]:[2022-10-07 09:01:52,965] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_01-model_states.pt. +[default1]:[2022-10-07 09:01:52,972] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default1]:[2022-10-07 09:01:52,990] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default1]:[2022-10-07 09:01:52,990] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_01-model_states.pt... +[default1]:[2022-10-07 09:01:53,008] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_01-model_states.pt. +[default1]:[2022-10-07 09:01:53,013] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default1]:[2022-10-07 09:01:53,030] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default1]:[2022-10-07 09:01:53,030] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_01-model_states.pt... +[default5]:[2022-10-07 09:01:52,968] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_01-model_states.pt. +[default5]:[2022-10-07 09:01:52,973] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default5]:[2022-10-07 09:01:52,989] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default5]:[2022-10-07 09:01:52,990] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_01-model_states.pt... +[default5]:[2022-10-07 09:01:53,018] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_01-model_states.pt. +[default5]:[2022-10-07 09:01:53,023] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default7]:[2022-10-07 09:01:52,942] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_01-model_states.pt. +[default7]:[2022-10-07 09:01:52,950] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default7]:[2022-10-07 09:01:52,980] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default7]:[2022-10-07 09:01:52,981] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_01-model_states.pt... +[default7]:[2022-10-07 09:01:53,002] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_01-model_states.pt. +[default7]:[2022-10-07 09:01:53,011] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default7]:[2022-10-07 09:01:53,031] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default7]:[2022-10-07 09:01:53,031] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_01-model_states.pt... +[default0]:[2022-10-07 09:01:52,961] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default0]:[2022-10-07 09:01:52,965] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:52,966] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default0]:[2022-10-07 09:01:52,966] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:52,966] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default0]:[2022-10-07 09:01:53,011] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_02_optim_states.pt'] +[default0]:[2022-10-07 09:01:53,011] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_02_optim_states.pt'] +[default3]:[2022-10-07 09:01:52,939] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_01-model_states.pt. +[default3]:[2022-10-07 09:01:52,943] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default3]:[2022-10-07 09:01:52,968] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default3]:[2022-10-07 09:01:52,968] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_01-model_states.pt... +[default3]:[2022-10-07 09:01:52,987] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_01-model_states.pt. +[default3]:[2022-10-07 09:01:52,994] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default3]:[2022-10-07 09:01:53,012] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default3]:[2022-10-07 09:01:53,012] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_01-model_states.pt... +[default3]:[2022-10-07 09:01:53,036] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_01-model_states.pt. +[default4]:[2022-10-07 09:01:52,956] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_02_model_states.pt. +[default4]:[2022-10-07 09:01:52,957] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default4]:[2022-10-07 09:01:52,977] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:52,977] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default4]:[2022-10-07 09:01:52,995] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:53,000] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default4]:[2022-10-07 09:01:53,020] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:53,020] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default4]:[2022-10-07 09:01:53,038] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:53,044] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default7]:[2022-10-07 09:01:52,946] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default7]:[2022-10-07 09:01:52,946] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_01-model_states.pt... +[default7]:[2022-10-07 09:01:52,963] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_01-model_states.pt. +[default7]:[2022-10-07 09:01:52,967] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default7]:[2022-10-07 09:01:52,985] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default7]:[2022-10-07 09:01:52,985] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_01-model_states.pt... +[default7]:[2022-10-07 09:01:53,008] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_01-model_states.pt. +[default7]:[2022-10-07 09:01:53,012] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default7]:[2022-10-07 09:01:53,029] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default7]:[2022-10-07 09:01:53,029] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_01-model_states.pt... +[default5]:[2022-10-07 09:01:52,946] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default5]:[2022-10-07 09:01:52,969] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default5]:[2022-10-07 09:01:52,970] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_01-model_states.pt... +[default5]:[2022-10-07 09:01:52,993] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_01-model_states.pt. +[default5]:[2022-10-07 09:01:52,998] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default5]:[2022-10-07 09:01:53,018] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default5]:[2022-10-07 09:01:53,019] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_01-model_states.pt... +[default5]:[2022-10-07 09:01:53,035] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_01-model_states.pt. +[default5]:[2022-10-07 09:01:53,040] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default2]:[2022-10-07 09:01:52,978] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:52,978] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default2]:[2022-10-07 09:01:53,010] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:53,021] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default2]:[2022-10-07 09:01:53,022] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default6]:[2022-10-07 09:01:53,005] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:53,041] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default1]:[2022-10-07 09:01:52,993] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_01-model_states.pt. +[default1]:[2022-10-07 09:01:53,002] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default1]:[2022-10-07 09:01:53,042] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default1]:[2022-10-07 09:01:53,043] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_01-model_states.pt... +[default3]:[2022-10-07 09:01:52,994] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_01-model_states.pt. +[default3]:[2022-10-07 09:01:53,006] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default3]:[2022-10-07 09:01:53,043] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default3]:[2022-10-07 09:01:53,043] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_01-model_states.pt... +[default5]:[2022-10-07 09:01:52,986] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_01-model_states.pt. +[default2]:[2022-10-07 09:01:52,984] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:52,984] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default2]:[2022-10-07 09:01:53,013] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:53,021] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default3]:[2022-10-07 09:01:52,953] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default3]:[2022-10-07 09:01:52,953] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_01-model_states.pt... +[default3]:[2022-10-07 09:01:52,993] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_01-model_states.pt. +[default3]:[2022-10-07 09:01:52,955] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_01-model_states.pt. +[default0]:[2022-10-07 09:01:52,984] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default3]:[2022-10-07 09:01:53,001] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default3]:[2022-10-07 09:01:53,033] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default3]:[2022-10-07 09:01:53,034] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_01-model_states.pt... +[default0]:[2022-10-07 09:01:52,984] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default3]:[2022-10-07 09:01:52,971] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default3]:[2022-10-07 09:01:53,012] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default0]:[2022-10-07 09:01:53,012] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default0]:[2022-10-07 09:01:53,020] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default3]:[2022-10-07 09:01:53,012] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_01-model_states.pt... +[default3]:[2022-10-07 09:01:53,050] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_01-model_states.pt. +[default2]:[2022-10-07 09:01:52,972] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:52,978] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default2]:[2022-10-07 09:01:53,009] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:53,009] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default2]:[2022-10-07 09:01:53,035] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:53,041] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:52,975] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default0]:[2022-10-07 09:01:52,976] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:53,009] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default0]:[2022-10-07 09:01:53,017] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:53,045] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default0]:[2022-10-07 09:01:53,045] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:52,963] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default1]:[2022-10-07 09:01:52,993] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_01-model_states.pt. +[default1]:[2022-10-07 09:01:52,999] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default1]:[2022-10-07 09:01:53,024] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default0]:[2022-10-07 09:01:52,967] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:53,005] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default0]:[2022-10-07 09:01:53,005] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:53,025] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default1]:[2022-10-07 09:01:53,028] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_01-model_states.pt... +[default0]:[2022-10-07 09:01:53,030] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default7]:[2022-10-07 09:01:52,972] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_01-model_states.pt. +[default7]:[2022-10-07 09:01:52,977] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default7]:[2022-10-07 09:01:52,978] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default7]:[2022-10-07 09:01:52,999] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_01-model_states.pt. +[default7]:[2022-10-07 09:01:53,004] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default7]:[2022-10-07 09:01:53,035] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default7]:[2022-10-07 09:01:53,035] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_01-model_states.pt... +[default7]:[2022-10-07 09:01:52,978] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_01-model_states.pt... +[default7]:[2022-10-07 09:01:52,978] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_01-model_states.pt. +[default5]:[2022-10-07 09:01:52,973] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_01-model_states.pt. +[default5]:[2022-10-07 09:01:52,978] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default5]:[2022-10-07 09:01:53,025] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default5]:[2022-10-07 09:01:53,025] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_01-model_states.pt... +[default7]:[2022-10-07 09:01:53,000] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default7]:[2022-10-07 09:01:53,001] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_01-model_states.pt... +[default1]:[2022-10-07 09:01:52,979] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default1]:[2022-10-07 09:01:52,980] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_01-model_states.pt... +[default1]:[2022-10-07 09:01:53,015] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_01-model_states.pt. +[default1]:[2022-10-07 09:01:53,022] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default2]:[2022-10-07 09:01:53,039] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default2]:[2022-10-07 09:01:53,060] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:53,061] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default2]:[2022-10-07 09:01:53,080] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:53,085] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default2]:[2022-10-07 09:01:53,115] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:53,115] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default1]:[2022-10-07 09:01:53,055] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_01-model_states.pt. +[default1]:[2022-10-07 09:01:53,060] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default1]:[2022-10-07 09:01:53,084] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default1]:[2022-10-07 09:01:53,085] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_01-model_states.pt... +[default1]:[2022-10-07 09:01:53,106] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_01-model_states.pt. +[default1]:[2022-10-07 09:01:53,113] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default1]:[2022-10-07 09:01:53,113] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default1]:[2022-10-07 09:01:53,113] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_01-model_states.pt... +[default1]:[2022-10-07 09:01:53,113] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_01-model_states.pt. +[default2]:[2022-10-07 09:01:53,097] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:53,098] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default3]:[2022-10-07 09:01:53,037] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_01-model_states.pt... +[default1]:[2022-10-07 09:01:53,049] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default1]:[2022-10-07 09:01:53,049] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_01-model_states.pt... +[default1]:[2022-10-07 09:01:53,073] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_01-model_states.pt. +[default1]:[2022-10-07 09:01:53,078] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default1]:[2022-10-07 09:01:53,100] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default1]:[2022-10-07 09:01:53,101] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_01-model_states.pt... +[default1]:[2022-10-07 09:01:53,126] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_01-model_states.pt. +[default1]:[2022-10-07 09:01:53,131] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default1]:[2022-10-07 09:01:53,052] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_01-model_states.pt. +[default1]:[2022-10-07 09:01:53,056] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default1]:[2022-10-07 09:01:53,077] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default1]:[2022-10-07 09:01:53,078] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_01-model_states.pt... +[default1]:[2022-10-07 09:01:53,100] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_01-model_states.pt. +[default1]:[2022-10-07 09:01:53,104] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default1]:[2022-10-07 09:01:53,104] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default1]:[2022-10-07 09:01:53,105] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_01-model_states.pt... +[default1]:[2022-10-07 09:01:53,105] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_01-model_states.pt. +[default5]:[2022-10-07 09:01:53,041] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default5]:[2022-10-07 09:01:53,042] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_01-model_states.pt... +[default5]:[2022-10-07 09:01:53,073] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_01-model_states.pt. +[default5]:[2022-10-07 09:01:53,078] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default5]:[2022-10-07 09:01:53,101] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default5]:[2022-10-07 09:01:53,102] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_01-model_states.pt... +[default5]:[2022-10-07 09:01:53,132] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_01-model_states.pt. +[default5]:[2022-10-07 09:01:53,137] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default7]:[2022-10-07 09:01:53,052] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_01-model_states.pt. +[default7]:[2022-10-07 09:01:53,060] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default7]:[2022-10-07 09:01:53,076] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default7]:[2022-10-07 09:01:53,076] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_01-model_states.pt... +[default7]:[2022-10-07 09:01:53,097] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_01-model_states.pt. +[default7]:[2022-10-07 09:01:53,106] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default7]:[2022-10-07 09:01:53,128] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default7]:[2022-10-07 09:01:53,128] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_01-model_states.pt... +[default2]:[2022-10-07 09:01:53,128] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_02_model_states.pt. +[default2]:[2022-10-07 09:01:53,129] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default3]:[2022-10-07 09:01:53,041] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default3]:[2022-10-07 09:01:53,058] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default3]:[2022-10-07 09:01:53,058] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_01-model_states.pt... +[default3]:[2022-10-07 09:01:53,080] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_01-model_states.pt. +[default3]:[2022-10-07 09:01:53,085] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default3]:[2022-10-07 09:01:53,106] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default3]:[2022-10-07 09:01:53,107] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_01-model_states.pt... +[default3]:[2022-10-07 09:01:53,122] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_01-model_states.pt. +[default3]:[2022-10-07 09:01:53,126] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default4]:[2022-10-07 09:01:53,064] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:53,064] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default4]:[2022-10-07 09:01:53,080] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:53,086] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default4]:[2022-10-07 09:01:53,105] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:53,105] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default4]:[2022-10-07 09:01:53,122] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:53,127] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default7]:[2022-10-07 09:01:53,047] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_01-model_states.pt. +[default7]:[2022-10-07 09:01:53,051] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default7]:[2022-10-07 09:01:53,071] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default7]:[2022-10-07 09:01:53,071] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_01-model_states.pt... +[default7]:[2022-10-07 09:01:53,088] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_01-model_states.pt. +[default7]:[2022-10-07 09:01:53,092] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default7]:[2022-10-07 09:01:53,111] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default7]:[2022-10-07 09:01:53,111] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_01-model_states.pt... +[default7]:[2022-10-07 09:01:53,129] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_01-model_states.pt. +[default7]:[2022-10-07 09:01:53,134] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default5]:[2022-10-07 09:01:53,059] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default5]:[2022-10-07 09:01:53,059] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_01-model_states.pt... +[default5]:[2022-10-07 09:01:53,084] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_01-model_states.pt. +[default5]:[2022-10-07 09:01:53,089] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default5]:[2022-10-07 09:01:53,107] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default5]:[2022-10-07 09:01:53,107] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_01-model_states.pt... +[default5]:[2022-10-07 09:01:53,126] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_01-model_states.pt. +[default5]:[2022-10-07 09:01:53,131] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default2]:[2022-10-07 09:01:53,049] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:53,049] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default2]:[2022-10-07 09:01:53,080] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:53,089] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default2]:[2022-10-07 09:01:53,117] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:53,117] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default6]:[2022-10-07 09:01:53,054] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:53,054] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default6]:[2022-10-07 09:01:53,080] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:53,147] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:53,092] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default6]:[2022-10-07 09:01:53,117] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:53,117] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default6]:[2022-10-07 09:01:53,134] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:53,140] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default6]:[2022-10-07 09:01:53,070] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:53,070] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default6]:[2022-10-07 09:01:53,095] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:53,101] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default6]:[2022-10-07 09:01:53,128] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:53,128] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default1]:[2022-10-07 09:01:53,078] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_01-model_states.pt. +[default1]:[2022-10-07 09:01:53,087] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default1]:[2022-10-07 09:01:53,119] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default1]:[2022-10-07 09:01:53,120] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_01-model_states.pt... +[default5]:[2022-10-07 09:01:53,057] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default5]:[2022-10-07 09:01:53,108] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default5]:[2022-10-07 09:01:53,108] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_01-model_states.pt... +[default5]:[2022-10-07 09:01:53,138] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_01-model_states.pt. +[default5]:[2022-10-07 09:01:53,143] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default2]:[2022-10-07 09:01:53,063] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:53,063] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default2]:[2022-10-07 09:01:53,092] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:53,081] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default4]:[2022-10-07 09:01:53,082] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default3]:[2022-10-07 09:01:53,071] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_01-model_states.pt. +[default3]:[2022-10-07 09:01:53,079] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default3]:[2022-10-07 09:01:53,106] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default3]:[2022-10-07 09:01:53,107] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_01-model_states.pt... +[default3]:[2022-10-07 09:01:53,140] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_01-model_states.pt. +[default3]:[2022-10-07 09:01:53,147] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default3]:[2022-10-07 09:01:53,060] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default3]:[2022-10-07 09:01:53,104] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default3]:[2022-10-07 09:01:53,104] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_01-model_states.pt... +[default3]:[2022-10-07 09:01:53,129] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_01-model_states.pt. +[default0]:[2022-10-07 09:01:53,060] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default0]:[2022-10-07 09:01:53,060] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:53,097] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:53,085] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:53,085] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default2]:[2022-10-07 09:01:53,109] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:53,115] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default4]:[2022-10-07 09:01:53,061] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:53,062] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:53,080] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default1]:[2022-10-07 09:01:53,071] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_01-model_states.pt. +[default1]:[2022-10-07 09:01:53,078] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:53,080] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:53,100] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default1]:[2022-10-07 09:01:53,104] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default1]:[2022-10-07 09:01:53,106] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_01-model_states.pt... +[default0]:[2022-10-07 09:01:53,107] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:53,149] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default0]:[2022-10-07 09:01:53,149] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default1]:[2022-10-07 09:01:53,140] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_01-model_states.pt. +[default1]:[2022-10-07 09:01:53,145] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default7]:[2022-10-07 09:01:53,073] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_01-model_states.pt. +[default7]:[2022-10-07 09:01:53,078] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default7]:[2022-10-07 09:01:53,111] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default7]:[2022-10-07 09:01:53,112] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_01-model_states.pt... +[default7]:[2022-10-07 09:01:53,146] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_01-model_states.pt. +[default7]:[2022-10-07 09:01:53,150] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default5]:[2022-10-07 09:01:53,058] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_01-model_states.pt. +[default5]:[2022-10-07 09:01:53,064] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default5]:[2022-10-07 09:01:53,112] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default5]:[2022-10-07 09:01:53,112] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_01-model_states.pt... +[default5]:[2022-10-07 09:01:53,150] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_01-model_states.pt. +[default5]:[2022-10-07 09:01:53,154] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default1]:[2022-10-07 09:01:53,056] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default1]:[2022-10-07 09:01:53,057] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_01-model_states.pt... +[default1]:[2022-10-07 09:01:53,102] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_01-model_states.pt. +[default6]:[2022-10-07 09:01:53,132] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:53,133] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default3]:[2022-10-07 09:01:53,082] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_01-model_states.pt. +[default3]:[2022-10-07 09:01:53,093] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default3]:[2022-10-07 09:01:53,120] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default3]:[2022-10-07 09:01:53,120] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_01-model_states.pt... +[default0]:[2022-10-07 09:01:53,078] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default0]:[2022-10-07 09:01:53,087] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:53,120] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default0]:[2022-10-07 09:01:53,121] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:53,148] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:53,127] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default4]:[2022-10-07 09:01:53,128] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default3]:[2022-10-07 09:01:53,224] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_03_optim_states.pt'] +[default3]:[2022-10-07 09:01:53,224] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_03_optim_states.pt'] +[default2]:[2022-10-07 09:01:53,141] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:53,147] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default2]:[2022-10-07 09:01:53,148] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:53,148] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default2]:[2022-10-07 09:01:53,148] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:53,225] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_02_optim_states.pt'] +[default2]:[2022-10-07 09:01:53,225] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_02_optim_states.pt'] +[default7]:[2022-10-07 09:01:53,221] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_03_model_states.pt. +[default7]:[2022-10-07 09:01:53,222] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default2]:[2022-10-07 09:01:53,218] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default3]:[2022-10-07 09:01:53,162] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_01-model_states.pt. +[default3]:[2022-10-07 09:01:53,191] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default3]:[2022-10-07 09:01:53,216] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default3]:[2022-10-07 09:01:53,216] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_01-model_states.pt... +[default1]:[2022-10-07 09:01:53,159] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default1]:[2022-10-07 09:01:53,160] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_01-model_states.pt... +[default1]:[2022-10-07 09:01:53,177] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_01-model_states.pt. +[default1]:[2022-10-07 09:01:53,181] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default1]:[2022-10-07 09:01:53,201] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default1]:[2022-10-07 09:01:53,201] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_01-model_states.pt... +[default1]:[2022-10-07 09:01:53,230] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_01-model_states.pt. +[default1]:[2022-10-07 09:01:53,234] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default5]:[2022-10-07 09:01:53,159] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default5]:[2022-10-07 09:01:53,159] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_01-model_states.pt... +[default5]:[2022-10-07 09:01:53,196] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_01-model_states.pt. +[default5]:[2022-10-07 09:01:53,201] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default5]:[2022-10-07 09:01:53,230] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default5]:[2022-10-07 09:01:53,230] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_01-model_states.pt... +[default7]:[2022-10-07 09:01:53,147] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_01-model_states.pt. +[default7]:[2022-10-07 09:01:53,155] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default7]:[2022-10-07 09:01:53,177] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default7]:[2022-10-07 09:01:53,177] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_01-model_states.pt... +[default7]:[2022-10-07 09:01:53,200] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_01-model_states.pt. +[default7]:[2022-10-07 09:01:53,208] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default7]:[2022-10-07 09:01:53,232] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default7]:[2022-10-07 09:01:53,232] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_01-model_states.pt... +[default2]:[2022-10-07 09:01:53,152] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:53,152] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default2]:[2022-10-07 09:01:53,168] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:53,172] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default2]:[2022-10-07 09:01:53,189] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:53,189] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default2]:[2022-10-07 09:01:53,206] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:53,210] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default2]:[2022-10-07 09:01:53,227] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:53,228] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default3]:[2022-10-07 09:01:53,142] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default3]:[2022-10-07 09:01:53,143] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_01-model_states.pt... +[default3]:[2022-10-07 09:01:53,158] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_01-model_states.pt. +[default3]:[2022-10-07 09:01:53,162] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default3]:[2022-10-07 09:01:53,178] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default3]:[2022-10-07 09:01:53,178] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_01-model_states.pt... +[default3]:[2022-10-07 09:01:53,194] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_01-model_states.pt. +[default3]:[2022-10-07 09:01:53,198] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default3]:[2022-10-07 09:01:53,223] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default3]:[2022-10-07 09:01:53,223] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_01-model_states.pt... +[default4]:[2022-10-07 09:01:53,147] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:53,147] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default4]:[2022-10-07 09:01:53,167] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:53,173] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default4]:[2022-10-07 09:01:53,191] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:53,192] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default4]:[2022-10-07 09:01:53,208] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:53,213] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default4]:[2022-10-07 09:01:53,228] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:53,229] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default4]:[2022-10-07 09:01:53,245] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default7]:[2022-10-07 09:01:53,153] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default7]:[2022-10-07 09:01:53,153] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_01-model_states.pt... +[default7]:[2022-10-07 09:01:53,171] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_01-model_states.pt. +[default7]:[2022-10-07 09:01:53,177] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default7]:[2022-10-07 09:01:53,195] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default7]:[2022-10-07 09:01:53,195] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_01-model_states.pt... +[default7]:[2022-10-07 09:01:53,211] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_01-model_states.pt. +[default7]:[2022-10-07 09:01:53,215] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default7]:[2022-10-07 09:01:53,230] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default7]:[2022-10-07 09:01:53,230] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_01-model_states.pt... +[default7]:[2022-10-07 09:01:53,245] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_01-model_states.pt. +[default5]:[2022-10-07 09:01:53,154] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default5]:[2022-10-07 09:01:53,154] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_01-model_states.pt... +[default5]:[2022-10-07 09:01:53,173] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_01-model_states.pt. +[default5]:[2022-10-07 09:01:53,179] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default5]:[2022-10-07 09:01:53,179] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default5]:[2022-10-07 09:01:53,179] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_01-model_states.pt... +[default5]:[2022-10-07 09:01:53,179] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_01-model_states.pt. +[default6]:[2022-10-07 09:01:53,161] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:53,161] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default6]:[2022-10-07 09:01:53,180] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:53,168] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default2]:[2022-10-07 09:01:53,199] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:53,199] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default2]:[2022-10-07 09:01:53,228] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:53,242] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default6]:[2022-10-07 09:01:53,152] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:53,157] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default6]:[2022-10-07 09:01:53,188] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:53,188] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default6]:[2022-10-07 09:01:53,208] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:53,214] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default6]:[2022-10-07 09:01:53,246] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:53,247] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default1]:[2022-10-07 09:01:53,157] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_01-model_states.pt. +[default1]:[2022-10-07 09:01:53,170] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default1]:[2022-10-07 09:01:53,210] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default1]:[2022-10-07 09:01:53,214] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_01-model_states.pt... +[default1]:[2022-10-07 09:01:53,244] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_01-model_states.pt. +[default3]:[2022-10-07 09:01:53,158] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_01-model_states.pt. +[default3]:[2022-10-07 09:01:53,179] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default3]:[2022-10-07 09:01:53,214] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default3]:[2022-10-07 09:01:53,214] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_01-model_states.pt... +[default3]:[2022-10-07 09:01:53,177] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default3]:[2022-10-07 09:01:53,178] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_01-model_states.pt... +[default3]:[2022-10-07 09:01:53,215] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_01-model_states.pt. +[default3]:[2022-10-07 09:01:53,223] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default6]:[2022-10-07 09:01:53,192] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default6]:[2022-10-07 09:01:53,213] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:53,213] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default6]:[2022-10-07 09:01:53,237] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:53,244] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default5]:[2022-10-07 09:01:53,167] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default5]:[2022-10-07 09:01:53,167] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_01-model_states.pt... +[default5]:[2022-10-07 09:01:53,189] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_01-model_states.pt. +[default5]:[2022-10-07 09:01:53,198] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default5]:[2022-10-07 09:01:53,231] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default5]:[2022-10-07 09:01:53,231] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_01-model_states.pt... +[default2]:[2022-10-07 09:01:53,224] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_00_optim_states.pt'] +[default2]:[2022-10-07 09:01:53,225] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt'] +[default4]:[2022-10-07 09:01:53,232] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:53,233] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:53,235] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_00_optim_states.pt'] +[default3]:[2022-10-07 09:01:53,223] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_01_optim_states.pt'] +[default3]:[2022-10-07 09:01:53,223] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt'] +[default0]:[2022-10-07 09:01:53,235] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt'] +[default2]:[2022-10-07 09:01:53,154] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:53,154] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default2]:[2022-10-07 09:01:53,183] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:53,189] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default2]:[2022-10-07 09:01:53,214] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:53,214] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default2]:[2022-10-07 09:01:53,237] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:53,243] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default4]:[2022-10-07 09:01:53,241] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default0]:[2022-10-07 09:01:53,169] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default1]:[2022-10-07 09:01:53,170] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default1]:[2022-10-07 09:01:53,171] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_01-model_states.pt... +[default0]:[2022-10-07 09:01:53,174] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:53,208] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default0]:[2022-10-07 09:01:53,208] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default1]:[2022-10-07 09:01:53,210] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_01-model_states.pt. +[default1]:[2022-10-07 09:01:53,216] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default1]:[2022-10-07 09:01:53,241] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default0]:[2022-10-07 09:01:53,225] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default0]:[2022-10-07 09:01:53,230] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default1]:[2022-10-07 09:01:53,242] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_01-model_states.pt... +[default0]:[2022-10-07 09:01:53,156] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:53,197] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default0]:[2022-10-07 09:01:53,197] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:53,224] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default0]:[2022-10-07 09:01:53,231] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default7]:[2022-10-07 09:01:53,183] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default7]:[2022-10-07 09:01:53,183] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_01-model_states.pt... +[default7]:[2022-10-07 09:01:53,226] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_01-model_states.pt. +[default7]:[2022-10-07 09:01:53,231] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default7]:[2022-10-07 09:01:53,182] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_01-model_states.pt. +[default7]:[2022-10-07 09:01:53,249] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default5]:[2022-10-07 09:01:53,196] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default5]:[2022-10-07 09:01:53,196] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_01-model_states.pt... +[default5]:[2022-10-07 09:01:53,236] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_01-model_states.pt. +[default5]:[2022-10-07 09:01:53,240] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default6]:[2022-10-07 09:01:53,246] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_02_model_states.pt. +[default6]:[2022-10-07 09:01:53,246] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default6]:[2022-10-07 09:01:53,276] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:53,276] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default6]:[2022-10-07 09:01:53,299] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:53,310] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default7]:[2022-10-07 09:01:53,268] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default7]:[2022-10-07 09:01:53,269] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_01-model_states.pt... +[default7]:[2022-10-07 09:01:53,299] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_01-model_states.pt. +[default7]:[2022-10-07 09:01:53,310] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default1]:[2022-10-07 09:01:53,331] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_03_optim_states.pt'] +[default1]:[2022-10-07 09:01:53,331] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_03_optim_states.pt'] +[default0]:[2022-10-07 09:01:53,236] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_02_optim_states.pt'] +[default0]:[2022-10-07 09:01:53,236] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_02_optim_states.pt'] +[default2]:[2022-10-07 09:01:53,251] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default2]:[2022-10-07 09:01:53,273] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:53,274] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default2]:[2022-10-07 09:01:53,295] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:53,300] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default2]:[2022-10-07 09:01:53,325] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:53,325] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default3]:[2022-10-07 09:01:53,239] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_01-model_states.pt. +[default3]:[2022-10-07 09:01:53,247] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default3]:[2022-10-07 09:01:53,267] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default3]:[2022-10-07 09:01:53,268] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_01-model_states.pt... +[default3]:[2022-10-07 09:01:53,288] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_01-model_states.pt. +[default3]:[2022-10-07 09:01:53,293] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default1]:[2022-10-07 09:01:53,251] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default1]:[2022-10-07 09:01:53,251] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_01-model_states.pt... +[default1]:[2022-10-07 09:01:53,268] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_01-model_states.pt. +[default3]:[2022-10-07 09:01:53,318] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default3]:[2022-10-07 09:01:53,318] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_01-model_states.pt... +[default1]:[2022-10-07 09:01:53,273] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default1]:[2022-10-07 09:01:53,293] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default1]:[2022-10-07 09:01:53,293] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_01-model_states.pt... +[default1]:[2022-10-07 09:01:53,312] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_01-model_states.pt. +[default1]:[2022-10-07 09:01:53,317] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default1]:[2022-10-07 09:01:53,337] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default1]:[2022-10-07 09:01:53,337] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_01-model_states.pt... +[default5]:[2022-10-07 09:01:53,276] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_01-model_states.pt. +[default5]:[2022-10-07 09:01:53,280] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default5]:[2022-10-07 09:01:53,295] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default5]:[2022-10-07 09:01:53,296] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_01-model_states.pt... +[default5]:[2022-10-07 09:01:53,324] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_01-model_states.pt. +[default5]:[2022-10-07 09:01:53,329] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default7]:[2022-10-07 09:01:53,276] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_01-model_states.pt. +[default7]:[2022-10-07 09:01:53,285] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default7]:[2022-10-07 09:01:53,306] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default7]:[2022-10-07 09:01:53,306] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_01-model_states.pt... +[default7]:[2022-10-07 09:01:53,327] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_01-model_states.pt. +[default7]:[2022-10-07 09:01:53,336] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default2]:[2022-10-07 09:01:53,245] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:53,249] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default2]:[2022-10-07 09:01:53,263] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:53,263] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default2]:[2022-10-07 09:01:53,279] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:53,282] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default2]:[2022-10-07 09:01:53,302] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:53,303] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default2]:[2022-10-07 09:01:53,317] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:53,321] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default2]:[2022-10-07 09:01:53,336] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:53,336] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default3]:[2022-10-07 09:01:53,242] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_01-model_states.pt. +[default3]:[2022-10-07 09:01:53,246] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default3]:[2022-10-07 09:01:53,246] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default3]:[2022-10-07 09:01:53,246] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_01-model_states.pt... +[default3]:[2022-10-07 09:01:53,247] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_01-model_states.pt. +[default4]:[2022-10-07 09:01:53,250] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default4]:[2022-10-07 09:01:53,267] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:53,267] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default4]:[2022-10-07 09:01:53,283] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:53,289] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default4]:[2022-10-07 09:01:53,308] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:53,308] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default4]:[2022-10-07 09:01:53,322] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:53,326] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default4]:[2022-10-07 09:01:53,342] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:53,342] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default7]:[2022-10-07 09:01:53,250] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default7]:[2022-10-07 09:01:53,271] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default7]:[2022-10-07 09:01:53,271] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_01-model_states.pt... +[default7]:[2022-10-07 09:01:53,291] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_01-model_states.pt. +[default7]:[2022-10-07 09:01:53,296] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default7]:[2022-10-07 09:01:53,297] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default7]:[2022-10-07 09:01:53,297] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_01-model_states.pt... +[default7]:[2022-10-07 09:01:53,297] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_01-model_states.pt. +[default3]:[2022-10-07 09:01:53,316] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoint[default2]:[2022-10-07 09:01:53,289] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:53,289] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default2]:[2022-10-07 09:01:53,322] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:53,259] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:53,259] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default6]:[2022-10-07 09:01:53,277] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:53,282] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default6]:[2022-10-07 09:01:53,300] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:53,300] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default6]:[2022-10-07 09:01:53,322] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:53,328] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default6]:[2022-10-07 09:01:53,347] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:53,348] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default2]:[2022-10-07 09:01:53,329] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default6]:[2022-10-07 09:01:53,276] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:53,292] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default6]:[2022-10-07 09:01:53,317] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:53,317] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default6]:[2022-10-07 09:01:53,336] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:53,342] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default1]:[2022-10-07 09:01:53,341] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/globa[default5]:[2022-10-07 09:01:53,298] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_01-model_states.pt. +[default5]:[2022-10-07 09:01:53,302] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default5]:[2022-10-07 09:01:53,328] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +l_step0/zero_pp_rank_5_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_01_optim_states.pt'] +[default1]:[2022-10-07 09:01:53,341] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt'] +[default3]:[2022-10-07 09:01:53,251] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_01-model_states.pt. +[default5]:[2022-10-07 09:01:53,328] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_01-model_states.pt... +[default3]:[2022-10-07 09:01:53,254] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default3]:[2022-10-07 09:01:53,255] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_01-model_states.pt... +[default3]:[2022-10-07 09:01:53,290] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_01-model_states.pt. +[default3]:[2022-10-07 09:01:53,297] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default3]:[2022-10-07 09:01:53,322] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default3]:[2022-10-07 09:01:53,323] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_01-model_states.pt... +[default2]:[2022-10-07 09:01:53,269] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:53,269] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default2]:[2022-10-07 09:01:53,297] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:53,303] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default2]:[2022-10-07 09:01:53,332] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:53,332] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default4]:[2022-10-07 09:01:53,273] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default4]:[2022-10-07 09:01:53,318] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:53,318] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default4]:[2022-10-07 09:01:53,350] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default0]:[2022-10-07 09:01:53,263] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default1]:[2022-10-07 09:01:53,279] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_01-model_states.pt. +[default1]:[2022-10-07 09:01:53,285] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:53,263] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:53,284] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default0]:[2022-10-07 09:01:53,289] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:53,325] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default0]:[2022-10-07 09:01:53,326] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:53,344] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default1]:[2022-10-07 09:01:53,308] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default1]:[2022-10-07 09:01:53,309] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_01-model_states.pt... +[default1]:[2022-10-07 09:01:53,346] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_01-model_states.pt. +[default1]:[2022-10-07 09:01:53,353] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default7]:[2022-10-07 09:01:53,263] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default7]:[2022-10-07 09:01:53,264] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_01-model_states.pt... +[default7]:[2022-10-07 09:01:53,293] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_01-model_states.pt. +[default7]:[2022-10-07 09:01:53,298] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default7]:[2022-10-07 09:01:53,327] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default7]:[2022-10-07 09:01:53,328] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_01-model_states.pt... +[default7]:[2022-10-07 09:01:53,276] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default7]:[2022-10-07 09:01:53,276] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_01-model_states.pt... +[default7]:[2022-10-07 09:01:53,306] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_01-model_states.pt. +[default7]:[2022-10-07 09:01:53,315] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default7]:[2022-10-07 09:01:53,345] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default7]:[2022-10-07 09:01:53,346] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_01-model_states.pt... +[default5]:[2022-10-07 09:01:53,276] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default5]:[2022-10-07 09:01:53,277] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_01-model_states.pt... +[default5]:[2022-10-07 09:01:53,315] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_01-model_states.pt. +[default5]:[2022-10-07 09:01:53,319] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default1]:[2022-10-07 09:01:53,330] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_01_optim_states.pt'] +[default1]:[2022-10-07 09:01:53,330] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt'] +[default6]:[2022-10-07 09:01:53,341] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +s/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_03_optim_states.pt'] +[default1]:[2022-10-07 09:01:53,342] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_03_optim_states.pt'] +[default3]:[2022-10-07 09:01:53,316] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_03_optim_states.pt'] +[default1]:[2022-10-07 09:01:53,342] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_03_optim_states.pt'] +[default3]:[2022-10-07 09:01:53,316] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_01_optim_states.pt'] +[default3]:[2022-10-07 09:01:53,316] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt'] +[default0]:[2022-10-07 09:01:53,280] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default0]:[2022-10-07 09:01:53,280] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:53,311] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default0]:[2022-10-07 09:01:53,319] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:53,341] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default0]:[2022-10-07 09:01:53,341] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default6]:[2022-10-07 09:01:53,340] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:53,340] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default6]:[2022-10-07 09:01:53,365] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:53,370] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default6]:[2022-10-07 09:01:53,391] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:53,391] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default6]:[2022-10-07 09:01:53,413] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:53,417] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default7]:[2022-10-07 09:01:53,351] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default7]:[2022-10-07 09:01:53,352] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_01-model_states.pt... +[default7]:[2022-10-07 09:01:53,373] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_01-model_states.pt. +[default7]:[2022-10-07 09:01:53,379] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default7]:[2022-10-07 09:01:53,402] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default7]:[2022-10-07 09:01:53,403] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_01-model_states.pt... +[default7]:[2022-10-07 09:01:53,424] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_01-model_states.pt. +[default7]:[2022-10-07 09:01:53,430] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default2]:[2022-10-07 09:01:53,347] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:53,351] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default2]:[2022-10-07 09:01:53,376] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:53,376] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default2]:[2022-10-07 09:01:53,397] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:53,402] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default2]:[2022-10-07 09:01:53,422] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:53,422] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default3]:[2022-10-07 09:01:53,340] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_01-model_states.pt. +[default3]:[2022-10-07 09:01:53,345] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default3]:[2022-10-07 09:01:53,367] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default3]:[2022-10-07 09:01:53,368] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_01-model_states.pt... +[default3]:[2022-10-07 09:01:53,388] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_01-model_states.pt. +[default3]:[2022-10-07 09:01:53,393] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default3]:[2022-10-07 09:01:53,420] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default3]:[2022-10-07 09:01:53,420] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_01-model_states.pt... +[default1]:[2022-10-07 09:01:53,357] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_01-model_states.pt. +[default1]:[2022-10-07 09:01:53,362] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default1]:[2022-10-07 09:01:53,388] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default1]:[2022-10-07 09:01:53,388] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_01-model_states.pt... +[default1]:[2022-10-07 09:01:53,405] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_01-model_states.pt. +[default1]:[2022-10-07 09:01:53,414] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default1]:[2022-10-07 09:01:53,436] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default1]:[2022-10-07 09:01:53,436] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_01-model_states.pt... +[default5]:[2022-10-07 09:01:53,357] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default5]:[2022-10-07 09:01:53,358] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_01-model_states.pt... +[default5]:[2022-10-07 09:01:53,384] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_01-model_states.pt. +[default5]:[2022-10-07 09:01:53,437] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_01_optim_states.pt'] +[default5]:[2022-10-07 09:01:53,438] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt'] +[default7]:[2022-10-07 09:01:53,360] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default7]:[2022-10-07 09:01:53,360] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_01-model_states.pt... +[default7]:[2022-10-07 09:01:53,386] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_01-model_states.pt. +[default2]:[2022-10-07 09:01:53,350] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:53,354] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default2]:[2022-10-07 09:01:53,367] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:53,368] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default2]:[2022-10-07 09:01:53,381] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:53,385] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default2]:[2022-10-07 09:01:53,400] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:53,400] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default2]:[2022-10-07 09:01:53,414] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:53,418] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:53,401] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_02_optim_states.pt'] +[default0]:[2022-10-07 09:01:53,401] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_02_optim_states.pt'] +[default5]:[2022-10-07 09:01:53,438] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_03_optim_states.pt'] +[default5]:[2022-10-07 09:01:53,438] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_03_optim_states.pt'] +[default7]:[2022-10-07 09:01:53,442] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_03_optim_states.pt'] +[default7]:[2022-10-07 09:01:53,442] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_03_optim_states.pt'] +[default4]:[2022-10-07 09:01:53,356] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:53,360] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default4]:[2022-10-07 09:01:53,376] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:53,376] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default4]:[2022-10-07 09:01:53,390] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:53,394] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default4]:[2022-10-07 09:01:53,412] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:53,412] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default4]:[2022-10-07 09:01:53,430] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:53,434] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default4]:[2022-10-07 09:01:53,434] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:53,434] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default4]:[2022-10-07 09:01:53,435] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:53,369] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:53,376] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default6]:[2022-10-07 09:01:53,394] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:53,395] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default6]:[2022-10-07 09:01:53,416] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:53,421] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default7]:[2022-10-07 09:01:53,420] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_01_model_states.pt. +[default7]:[2022-10-07 09:01:53,420] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default5]:[2022-10-07 09:01:53,379] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_01_model_states.pt. +[default5]:[2022-10-07 09:01:53,380] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default2]:[2022-10-07 09:01:53,349] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:53,349] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default2]:[2022-10-07 09:01:53,371] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:53,378] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default2]:[2022-10-07 09:01:53,404] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:53,404] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default2]:[2022-10-07 09:01:53,425] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:53,432] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default6]:[2022-10-07 09:01:53,360] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:53,360] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default6]:[2022-10-07 09:01:53,378] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:53,383] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default6]:[2022-10-07 09:01:53,412] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:53,412] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default6]:[2022-10-07 09:01:53,429] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:53,434] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default5]:[2022-10-07 09:01:53,354] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_01-model_states.pt. +[default5]:[2022-10-07 09:01:53,360] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default5]:[2022-10-07 09:01:53,378] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default5]:[2022-10-07 09:01:53,378] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_01-model_states.pt... +[default5]:[2022-10-07 09:01:53,399] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_01-model_states.pt. +[default5]:[2022-10-07 09:01:53,404] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default5]:[2022-10-07 09:01:53,427] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default5]:[2022-10-07 09:01:53,428] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_01-model_states.pt... +[default2]:[2022-10-07 09:01:53,430] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_02_optim_states.pt'] +[default2]:[2022-10-07 09:01:53,430] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_02_optim_states.pt'] +[default4]:[2022-10-07 09:01:53,354] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:53,389] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default4]:[2022-10-07 09:01:53,415] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:53,416] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default4]:[2022-10-07 09:01:53,437] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:53,442] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default3]:[2022-10-07 09:01:53,354] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_01-model_states.pt. +[default3]:[2022-10-07 09:01:53,363] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default3]:[2022-10-07 09:01:53,383] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default3]:[2022-10-07 09:01:53,383] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_01-model_states.pt... +[default3]:[2022-10-07 09:01:53,404] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_01-model_states.pt. +[default3]:[2022-10-07 09:01:53,412] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default3]:[2022-10-07 09:01:53,439] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default3]:[2022-10-07 09:01:53,440] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_01-model_states.pt... +[default2]:[2022-10-07 09:01:53,365] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:53,429] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_00_optim_states.pt'] +[default2]:[2022-10-07 09:01:53,430] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt'] +[default4]:[2022-10-07 09:01:53,359] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default4]:[2022-10-07 09:01:53,390] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:53,391] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default4]:[2022-10-07 09:01:53,435] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:53,440] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:53,362] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default0]:[2022-10-07 09:01:53,368] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:53,399] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default0]:[2022-10-07 09:01:53,401] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/globa[default0]:[2022-10-07 09:01:53,399] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default0]:[2022-10-07 09:01:53,420] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default0]:[2022-10-07 09:01:53,425] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +l_step0/zero_pp_rank_5_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoint[default0]:[2022-10-07 09:01:53,445] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default0]:[2022-10-07 09:01:53,446] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +s/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_00_optim_states.pt'] +[default0]:[2022-10-07 09:01:53,401] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt'] +[default1]:[2022-10-07 09:01:53,370] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default1]:[2022-10-07 09:01:53,372] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_01-model_states.pt... +[default0]:could not find arguments in the checkpoint ... +[default0]: checkpoint version 3.0 +[default1]:[2022-10-07 09:01:53,402] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_01-model_states.pt. +[default1]:[2022-10-07 09:01:53,407] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default1]:[2022-10-07 09:01:53,426] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default1]:[2022-10-07 09:01:53,427] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_01-model_states.pt... +[default7]:[2022-10-07 09:01:53,364] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_01-model_states.pt. +[default7]:[2022-10-07 09:01:53,370] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default7]:[2022-10-07 09:01:53,392] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default7]:[2022-10-07 09:01:53,392] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_01-model_states.pt... +[default7]:[2022-10-07 09:01:53,429] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_01-model_states.pt. +[default7]:[2022-10-07 09:01:53,433] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default7]:[2022-10-07 09:01:53,374] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_01-model_states.pt. +[default7]:[2022-10-07 09:01:53,382] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default7]:[2022-10-07 09:01:53,408] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default7]:[2022-10-07 09:01:53,408] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_01-model_states.pt... +[default7]:[2022-10-07 09:01:53,432] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_01-model_states.pt. +[default7]:[2022-10-07 09:01:53,438] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default5]:[2022-10-07 09:01:53,357] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default5]:[2022-10-07 09:01:53,358] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_01-model_states.pt... +[default5]:[2022-10-07 09:01:53,382] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_01-model_states.pt. +[default5]:[2022-10-07 09:01:53,387] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default5]:[2022-10-07 09:01:53,416] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default5]:[2022-10-07 09:01:53,416] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_01-model_states.pt... +[default5]:[2022-10-07 09:01:53,440] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_01-model_states.pt. +[default5]:[2022-10-07 09:01:53,444] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default6]:[2022-10-07 09:01:53,379] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default6]:[2022-10-07 09:01:53,407] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:53,407] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default6]:[2022-10-07 09:01:53,434] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:53,440] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default4]:[2022-10-07 09:01:53,375] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:53,376] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default6]:[2022-10-07 09:01:53,436] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:53,437] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default6]:[2022-10-07 09:01:53,458] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:53,463] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default6]:[2022-10-07 09:01:53,489] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:53,490] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default6]:[2022-10-07 09:01:53,509] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:53,514] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default7]:[2022-10-07 09:01:53,452] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default7]:[2022-10-07 09:01:53,452] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_01-model_states.pt... +[default7]:[2022-10-07 09:01:53,474] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_01-model_states.pt. +[default7]:[2022-10-07 09:01:53,480] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default7]:[2022-10-07 09:01:53,502] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default7]:[2022-10-07 09:01:53,502] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_01-model_states.pt... +[default7]:[2022-10-07 09:01:53,526] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_01-model_states.pt. +[default7]:[2022-10-07 09:01:53,532] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default5]:[2022-10-07 09:01:53,487] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_03_model_states.pt. +[default5]:[2022-10-07 09:01:53,488] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default5]:[2022-10-07 09:01:53,510] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default5]:[2022-10-07 09:01:53,511] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_01-model_states.pt... +[default5]:[2022-10-07 09:01:53,529] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_01-model_states.pt. +[default5]:[2022-10-07 09:01:53,534] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default2]:[2022-10-07 09:01:53,443] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:53,447] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default2]:[2022-10-07 09:01:53,466] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:53,467] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default2]:[2022-10-07 09:01:53,484] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:53,488] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default2]:[2022-10-07 09:01:53,506] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:53,506] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default2]:[2022-10-07 09:01:53,526] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:53,531] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default1]:[2022-10-07 09:01:53,453] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_01-model_states.pt. +[default1]:[2022-10-07 09:01:53,457] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default3]:[2022-10-07 09:01:53,440] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_01-model_states.pt. +[default3]:[2022-10-07 09:01:53,444] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default3]:[2022-10-07 09:01:53,463] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default3]:[2022-10-07 09:01:53,463] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_01-model_states.pt... +[default3]:[2022-10-07 09:01:53,481] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_01-model_states.pt. +[default3]:[2022-10-07 09:01:53,486] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default3]:[2022-10-07 09:01:53,507] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default3]:[2022-10-07 09:01:53,507] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_01-model_states.pt... +[default3]:[2022-10-07 09:01:53,524] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_01-model_states.pt. +[default3]:[2022-10-07 09:01:53,529] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default1]:[2022-10-07 09:01:53,476] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default1]:[2022-10-07 09:01:53,477] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_01-model_states.pt... +[default1]:[2022-10-07 09:01:53,493] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_01-model_states.pt. +[default1]:[2022-10-07 09:01:53,497] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default1]:[2022-10-07 09:01:53,515] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default1]:[2022-10-07 09:01:53,515] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_01-model_states.pt... +[default1]:[2022-10-07 09:01:53,531] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_01-model_states.pt. +[default1]:[2022-10-07 09:01:53,535] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default7]:[2022-10-07 09:01:53,442] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_01_optim_states.pt'] +[default7]:[2022-10-07 09:01:53,442] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt'] +[default2]:[2022-10-07 09:01:53,443] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:53,444] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default2]:[2022-10-07 09:01:53,458] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:53,462] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default2]:[2022-10-07 09:01:53,476] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:53,476] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default2]:[2022-10-07 09:01:53,490] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:53,494] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default2]:[2022-10-07 09:01:53,508] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:53,509] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default2]:[2022-10-07 09:01:53,524] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:53,527] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default1]:[2022-10-07 09:01:53,511] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_03_optim_states.pt'] +[default1]:[2022-10-07 09:01:53,511] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_03_optim_states.pt'] +[default0]:[2022-10-07 09:01:53,520] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_02_optim_states.pt'] +[default0]:[2022-10-07 09:01:53,521] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_02_optim_states.pt'] +[default6]:[2022-10-07 09:01:53,449] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:53,449] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default6]:[2022-10-07 09:01:53,473] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:53,479] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default6]:[2022-10-07 09:01:53,496] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:53,497] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default6]:[2022-10-07 09:01:53,514] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:53,519] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default6]:[2022-10-07 09:01:53,539] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:53,539] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default7]:[2022-10-07 09:01:53,532] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default7]:[2022-10-07 09:01:53,533] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_01-model_states.pt... +[default5]:[2022-10-07 09:01:53,513] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default5]:[2022-10-07 09:01:53,514] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_01-model_states.pt... +[default2]:[2022-10-07 09:01:53,455] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:53,455] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default2]:[2022-10-07 09:01:53,489] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:53,458] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:53,458] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default6]:[2022-10-07 09:01:53,475] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:53,480] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default6]:[2022-10-07 09:01:53,497] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:53,497] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default6]:[2022-10-07 09:01:53,513] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:53,519] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default6]:[2022-10-07 09:01:53,544] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:53,544] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default5]:[2022-10-07 09:01:53,450] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_01-model_states.pt. +[default5]:[2022-10-07 09:01:53,456] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default5]:[2022-10-07 09:01:53,484] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default5]:[2022-10-07 09:01:53,485] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_01-model_states.pt... +[default5]:[2022-10-07 09:01:53,504] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_01-model_states.pt. +[default5]:[2022-10-07 09:01:53,509] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default5]:[2022-10-07 09:01:53,533] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default5]:[2022-10-07 09:01:53,534] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_01-model_states.pt... +[default4]:[2022-10-07 09:01:53,470] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:53,470] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default4]:[2022-10-07 09:01:53,492] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:53,496] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default4]:[2022-10-07 09:01:53,519] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:53,519] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default4]:[2022-10-07 09:01:53,544] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:53,549] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default3]:[2022-10-07 09:01:53,469] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_01-model_states.pt. +[default4]:[2022-10-07 09:01:53,471] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:53,471] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default4]:[2022-10-07 09:01:53,493] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:53,498] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default4]:[2022-10-07 09:01:53,524] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:53,524] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default4]:[2022-10-07 09:01:53,541] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:53,547] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default1]:[2022-10-07 09:01:53,456] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_01-model_states.pt. +[default1]:[2022-10-07 09:01:53,510] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_01_optim_states.pt'] +[default1]:[2022-10-07 09:01:53,511] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt'] +[default0]:[2022-10-07 09:01:53,466] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default0]:[2022-10-07 09:01:53,519] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_00_optim_states.pt'] +[default0]:[2022-10-07 09:01:53,520] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt'] +[default7]:[2022-10-07 09:01:53,463] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default7]:[2022-10-07 09:01:53,463] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_01-model_states.pt... +[default7]:[2022-10-07 09:01:53,493] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_01-model_states.pt. +[default7]:[2022-10-07 09:01:53,467] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default7]:[2022-10-07 09:01:53,467] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_01-model_states.pt... +[default7]:[2022-10-07 09:01:53,491] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_01-model_states.pt. +[default7]:[2022-10-07 09:01:53,497] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default7]:[2022-10-07 09:01:53,520] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default7]:[2022-10-07 09:01:53,520] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_01-model_states.pt... +[default7]:[2022-10-07 09:01:53,548] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_01-model_states.pt. +[default7]:[2022-10-07 09:01:53,555] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default5]:[2022-10-07 09:01:53,471] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default5]:[2022-10-07 09:01:53,471] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_01-model_states.pt... +[default5]:[2022-10-07 09:01:53,495] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_01-model_states.pt. +[default5]:[2022-10-07 09:01:53,500] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default5]:[2022-10-07 09:01:53,520] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default5]:[2022-10-07 09:01:53,521] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_01-model_states.pt... +[default5]:[2022-10-07 09:01:53,539] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_01-model_states.pt. +[default6]:[2022-10-07 09:01:53,478] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:53,479] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default6]:[2022-10-07 09:01:53,520] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:53,525] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default6]:[2022-10-07 09:01:53,546] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:53,547] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default4]:[2022-10-07 09:01:53,518] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:53,555] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_02_model_states.pt. +[default4]:[2022-10-07 09:01:53,556] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default4]:[2022-10-07 09:01:53,581] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:53,581] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default4]:[2022-10-07 09:01:53,604] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:53,609] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default4]:[2022-10-07 09:01:53,631] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:53,632] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default6]:[2022-10-07 09:01:53,558] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:53,558] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default6]:[2022-10-07 09:01:53,578] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:53,585] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default6]:[2022-10-07 09:01:53,607] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:53,607] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default6]:[2022-10-07 09:01:53,628] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:53,635] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default7]:[2022-10-07 09:01:53,559] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default7]:[2022-10-07 09:01:53,559] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_01-model_states.pt... +[default7]:[2022-10-07 09:01:53,578] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_01-model_states.pt. +[default7]:[2022-10-07 09:01:53,585] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default7]:[2022-10-07 09:01:53,607] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default7]:[2022-10-07 09:01:53,607] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_01-model_states.pt... +[default7]:[2022-10-07 09:01:53,628] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_01-model_states.pt. +[default5]:[2022-10-07 09:01:53,553] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default5]:[2022-10-07 09:01:53,553] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_01-model_states.pt... +[default5]:[2022-10-07 09:01:53,573] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_01-model_states.pt. +[default5]:[2022-10-07 09:01:53,577] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default5]:[2022-10-07 09:01:53,600] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default5]:[2022-10-07 09:01:53,600] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_01-model_states.pt... +[default5]:[2022-10-07 09:01:53,621] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_01-model_states.pt. +[default5]:[2022-10-07 09:01:53,625] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default2]:[2022-10-07 09:01:53,550] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:53,551] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default2]:[2022-10-07 09:01:53,571] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:53,576] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default2]:[2022-10-07 09:01:53,594] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:53,594] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default2]:[2022-10-07 09:01:53,610] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:53,615] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default2]:[2022-10-07 09:01:53,635] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:53,635] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default3]:[2022-10-07 09:01:53,549] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default3]:[2022-10-07 09:01:53,550] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_01-model_states.pt... +[default3]:[2022-10-07 09:01:53,570] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_01-model_states.pt. +[default3]:[2022-10-07 09:01:53,575] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default3]:[2022-10-07 09:01:53,597] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default3]:[2022-10-07 09:01:53,597] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_01-model_states.pt... +[default3]:[2022-10-07 09:01:53,612] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_01-model_states.pt. +[default3]:[2022-10-07 09:01:53,617] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default3]:[2022-10-07 09:01:53,634] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default3]:[2022-10-07 09:01:53,634] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_01-model_states.pt... +[default1]:[2022-10-07 09:01:53,558] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default1]:[2022-10-07 09:01:53,558] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_01-model_states.pt... +[default1]:[2022-10-07 09:01:53,579] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_01-model_states.pt. +[default1]:[2022-10-07 09:01:53,629] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_01_optim_states.pt'] +[default1]:[2022-10-07 09:01:53,629] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt'] +[default1]:[2022-10-07 09:01:53,629] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_03_optim_states.pt'] +[default1]:[2022-10-07 09:01:53,629] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_03_optim_states.pt'] +[default2]:[2022-10-07 09:01:53,545] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:53,545] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default2]:[2022-10-07 09:01:53,561] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:53,565] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default2]:[2022-10-07 09:01:53,565] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:53,565] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default2]:[2022-10-07 09:01:53,565] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:53,578] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_02_optim_states.pt'] +[default2]:[2022-10-07 09:01:53,578] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_02_optim_states.pt'] +[default6]:[2022-10-07 09:01:53,560] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:53,566] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default6]:[2022-10-07 09:01:53,583] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:53,584] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default6]:[2022-10-07 09:01:53,602] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default5]:[2022-10-07 09:01:53,593] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/globa[default5]:[2022-10-07 09:01:53,645] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_01-model_states.pt. +l_step0/zero_pp_rank_5_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_03_optim_states.pt'] +[default5]:[2022-10-07 09:01:53,593] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_03_optim_states.pt'] +[default7]:[2022-10-07 09:01:53,644] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_01-model_states.pt. +[default2]:[2022-10-07 09:01:53,577] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoint[default5]:[2022-10-07 09:01:53,560] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_01-model_states.pt. +[default5]:[2022-10-07 09:01:53,565] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default5]:[2022-10-07 09:01:53,588] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default5]:[2022-10-07 09:01:53,588] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_01-model_states.pt... +[default5]:[2022-10-07 09:01:53,609] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_01-model_states.pt. +[default5]:[2022-10-07 09:01:53,613] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default5]:[2022-10-07 09:01:53,630] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default5]:[2022-10-07 09:01:53,631] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_01-model_states.pt... +[default5]:[2022-10-07 09:01:53,647] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_01-model_states.pt. +[default3]:[2022-10-07 09:01:53,553] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_03_optim_states.pt'] +[default3]:[2022-10-07 09:01:53,553] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_03_optim_states.pt'] +[default4]:[2022-10-07 09:01:53,571] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:53,571] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default4]:[2022-10-07 09:01:53,595] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:53,599] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default4]:[2022-10-07 09:01:53,617] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:53,617] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default4]:[2022-10-07 09:01:53,635] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:53,640] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default3]:[2022-10-07 09:01:53,552] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_01_optim_states.pt'] +[default3]:[2022-10-07 09:01:53,552] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt'] +[default4]:[2022-10-07 09:01:53,570] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:53,570] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default4]:[2022-10-07 09:01:53,596] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:53,599] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default4]:[2022-10-07 09:01:53,614] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:53,615] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default4]:[2022-10-07 09:01:53,629] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:53,633] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default4]:[2022-10-07 09:01:53,647] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:53,647] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default7]:[2022-10-07 09:01:53,571] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/globa[default7]:[2022-10-07 09:01:53,571] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpointl_step0/zero_pp_rank_5_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_03_optim_states.pt'] +s/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_01_optim_states.pt'] +[default7]:[2022-10-07 09:01:53,571] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmi[default7]:[2022-10-07 09:01:53,571] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_0xnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_03_optim_states.pt'] +1_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt'] +[default7]:[2022-10-07 09:01:53,580] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default7]:[2022-10-07 09:01:53,580] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_01-model_states.pt... +[default7]:[2022-10-07 09:01:53,602] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_01-model_states.pt. +[default7]:[2022-10-07 09:01:53,609] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default7]:[2022-10-07 09:01:53,627] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default7]:[2022-10-07 09:01:53,627] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_01-model_states.pt... +[default7]:[2022-10-07 09:01:53,645] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_01-model_states.pt. +[default7]:[2022-10-07 09:01:53,651] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default5]:[2022-10-07 09:01:53,592] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_01_optim_states.pt'] +[default5]:[2022-10-07 09:01:53,592] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt'] +[default6]:[2022-10-07 09:01:53,575] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:53,580] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default6]:[2022-10-07 09:01:53,606] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:53,607] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default6]:[2022-10-07 09:01:53,632] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:53,636] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +s/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_00_optim_states.pt'] +[default2]:[2022-10-07 09:01:53,577] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt'] +[default6]:[2022-10-07 09:01:53,566] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:53,572] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default6]:[2022-10-07 09:01:53,592] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:53,592] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default6]:[2022-10-07 09:01:53,605] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:53,610] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default6]:[2022-10-07 09:01:53,625] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:53,626] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default6]:[2022-10-07 09:01:53,641] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:53,585] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default4]:[2022-10-07 09:01:53,604] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:53,604] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default4]:[2022-10-07 09:01:53,622] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:53,627] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default4]:[2022-10-07 09:01:53,655] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:53,655] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default4]:[2022-10-07 09:01:53,654] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:53,660] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default4]:[2022-10-07 09:01:53,684] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:53,684] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default4]:[2022-10-07 09:01:53,706] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:53,713] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default6]:[2022-10-07 09:01:53,656] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:53,657] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default6]:[2022-10-07 09:01:53,677] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:53,682] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default6]:[2022-10-07 09:01:53,706] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:53,706] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default6]:[2022-10-07 09:01:53,725] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:53,731] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default7]:[2022-10-07 09:01:53,635] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default7]:[2022-10-07 09:01:53,656] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default7]:[2022-10-07 09:01:53,656] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_01-model_states.pt... +[default7]:[2022-10-07 09:01:53,681] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_01-model_states.pt. +[default7]:[2022-10-07 09:01:53,686] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default7]:[2022-10-07 09:01:53,709] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default7]:[2022-10-07 09:01:53,709] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_01-model_states.pt... +[default7]:[2022-10-07 09:01:53,732] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_01-model_states.pt. +[default5]:[2022-10-07 09:01:53,645] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default5]:[2022-10-07 09:01:53,645] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_01-model_states.pt... +[default5]:[2022-10-07 09:01:53,664] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_01-model_states.pt. +[default5]:[2022-10-07 09:01:53,668] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default5]:[2022-10-07 09:01:53,687] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default5]:[2022-10-07 09:01:53,687] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_01-model_states.pt... +[default5]:[2022-10-07 09:01:53,706] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_01-model_states.pt. +[default5]:[2022-10-07 09:01:53,713] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default5]:[2022-10-07 09:01:53,731] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default5]:[2022-10-07 09:01:53,731] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_01-model_states.pt... +[default2]:[2022-10-07 09:01:53,650] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:53,657] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default2]:[2022-10-07 09:01:53,675] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:53,675] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default2]:[2022-10-07 09:01:53,691] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:53,696] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default2]:[2022-10-07 09:01:53,714] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:53,715] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default2]:[2022-10-07 09:01:53,732] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default3]:[2022-10-07 09:01:53,649] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_01-model_states.pt. +[default3]:[2022-10-07 09:01:53,655] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default3]:[2022-10-07 09:01:53,671] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default3]:[2022-10-07 09:01:53,672] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_01-model_states.pt... +[default3]:[2022-10-07 09:01:53,687] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_01-model_states.pt. +[default3]:[2022-10-07 09:01:53,691] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default3]:[2022-10-07 09:01:53,712] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default3]:[2022-10-07 09:01:53,712] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_01-model_states.pt... +[default3]:[2022-10-07 09:01:53,733] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_01-model_states.pt. +[default6]:[2022-10-07 09:01:53,690] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_02_optim_states.pt'] +[default6]:[2022-10-07 09:01:53,690] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_02_optim_states.pt'] +[default5]:[2022-10-07 09:01:53,688] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default5]:[2022-10-07 09:01:53,708] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default5]:[2022-10-07 09:01:53,708] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_01-model_states.pt... +[default5]:[2022-10-07 09:01:53,729] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_01-model_states.pt. +[default5]:[2022-10-07 09:01:53,734] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default7]:[2022-10-07 09:01:53,687] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default7]:[2022-10-07 09:01:53,706] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default7]:[2022-10-07 09:01:53,706] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_01-model_states.pt... +[default7]:[2022-10-07 09:01:53,727] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_01-model_states.pt. +[default7]:[2022-10-07 09:01:53,732] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default6]:[2022-10-07 09:01:53,689] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_00_optim_states.pt'] +[default5]:[2022-10-07 09:01:53,652] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default5]:[2022-10-07 09:01:53,669] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default5]:[2022-10-07 09:01:53,669] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_01-model_states.pt... +[default6]:[2022-10-07 09:01:53,689] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt'] +[default5]:[2022-10-07 09:01:53,686] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_01-model_states.pt. +[default5]:[2022-10-07 09:01:53,691] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default5]:[2022-10-07 09:01:53,712] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default5]:[2022-10-07 09:01:53,713] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_01-model_states.pt... +[default5]:[2022-10-07 09:01:53,732] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_01-model_states.pt. +[default4]:[2022-10-07 09:01:53,657] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:53,657] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default4]:[2022-10-07 09:01:53,673] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:53,678] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default4]:[2022-10-07 09:01:53,698] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:53,699] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default4]:[2022-10-07 09:01:53,717] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:53,721] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default4]:[2022-10-07 09:01:53,738] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:53,738] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default4]:[2022-10-07 09:01:53,662] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:53,666] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default4]:[2022-10-07 09:01:53,681] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:53,681] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default4]:[2022-10-07 09:01:53,696] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:53,700] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default4]:[2022-10-07 09:01:53,717] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:53,717] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default4]:[2022-10-07 09:01:53,732] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:53,737] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default4]:[2022-10-07 09:01:53,753] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:53,753] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default7]:[2022-10-07 09:01:53,668] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default7]:[2022-10-07 09:01:53,669] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_01-model_states.pt... +[default7]:[2022-10-07 09:01:53,687] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_01-model_states.pt. +[default7]:[2022-10-07 09:01:53,693] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default7]:[2022-10-07 09:01:53,714] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default7]:[2022-10-07 09:01:53,715] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_01-model_states.pt... +[default7]:[2022-10-07 09:01:53,734] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_01-model_states.pt. +[default7]:[2022-10-07 09:01:53,739] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default7]:[2022-10-07 09:01:53,754] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default7]:[2022-10-07 09:01:53,754] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_01-model_states.pt... +[default6]:[2022-10-07 09:01:53,661] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:53,661] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default6]:[2022-10-07 09:01:53,676] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:53,681] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default6]:[2022-10-07 09:01:53,706] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:53,706] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default6]:[2022-10-07 09:01:53,732] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:53,737] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default6]:[2022-10-07 09:01:53,752] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:53,752] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default4]:[2022-10-07 09:01:53,679] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:53,687] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default4]:[2022-10-07 09:01:53,704] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:53,704] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default4]:[2022-10-07 09:01:53,719] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:53,724] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default4]:[2022-10-07 09:01:53,741] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:53,741] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default4]:[2022-10-07 09:01:53,756] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:53,761] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default4]:[2022-10-07 09:01:53,737] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:53,737] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default4]:[2022-10-07 09:01:53,762] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:53,767] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default4]:[2022-10-07 09:01:53,788] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:53,788] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default4]:[2022-10-07 09:01:53,811] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:53,815] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default6]:[2022-10-07 09:01:53,753] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:53,753] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default6]:[2022-10-07 09:01:53,775] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:53,780] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default6]:[2022-10-07 09:01:53,799] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:53,799] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default6]:[2022-10-07 09:01:53,821] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:53,826] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default7]:[2022-10-07 09:01:53,736] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default7]:[2022-10-07 09:01:53,758] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default7]:[2022-10-07 09:01:53,758] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_01-model_states.pt... +[default7]:[2022-10-07 09:01:53,784] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_01-model_states.pt. +[default7]:[2022-10-07 09:01:53,788] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default7]:[2022-10-07 09:01:53,809] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default7]:[2022-10-07 09:01:53,809] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_01-model_states.pt... +[default7]:[2022-10-07 09:01:53,829] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_01-model_states.pt. +[default7]:[2022-10-07 09:01:53,834] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default5]:[2022-10-07 09:01:53,751] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_01-model_states.pt. +[default5]:[2022-10-07 09:01:53,755] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default5]:[2022-10-07 09:01:53,773] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default5]:[2022-10-07 09:01:53,774] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_01-model_states.pt... +[default5]:[2022-10-07 09:01:53,793] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_01-model_states.pt. +[default5]:[2022-10-07 09:01:53,797] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default5]:[2022-10-07 09:01:53,816] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default5]:[2022-10-07 09:01:53,817] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_01-model_states.pt... +[default5]:[2022-10-07 09:01:53,835] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_01-model_states.pt. +[default2]:[2022-10-07 09:01:53,739] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default2]:[2022-10-07 09:01:53,759] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:53,760] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default2]:[2022-10-07 09:01:53,779] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default2]:[2022-10-07 09:01:53,827] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_00_optim_states.pt'] +[default2]:[2022-10-07 09:01:53,827] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt'] +[default3]:[2022-10-07 09:01:53,782] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_01_optim_states.pt'] +[default3]:[2022-10-07 09:01:53,782] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt'] +[default3]:[2022-10-07 09:01:53,782] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_03_optim_states.pt'] +[default3]:[2022-10-07 09:01:53,782] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_03_optim_states.pt'] +[default2]:[2022-10-07 09:01:53,828] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_02_optim_states.pt'] +[default2]:[2022-10-07 09:01:53,828] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_02_optim_states.pt'] +[default5]:[2022-10-07 09:01:53,754] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default5]:[2022-10-07 09:01:53,754] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_01-model_states.pt... +[default5]:[2022-10-07 09:01:53,773] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_01-model_states.pt. +[default5]:[2022-10-07 09:01:53,777] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default5]:[2022-10-07 09:01:53,796] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default5]:[2022-10-07 09:01:53,796] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_01-model_states.pt... +[default5]:[2022-10-07 09:01:53,814] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_01-model_states.pt. +[default5]:[2022-10-07 09:01:53,819] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default5]:[2022-10-07 09:01:53,836] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default5]:[2022-10-07 09:01:53,837] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_01-model_states.pt... +[default7]:[2022-10-07 09:01:53,750] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default7]:[2022-10-07 09:01:53,751] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_01-model_states.pt... +[default7]:[2022-10-07 09:01:53,772] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_01-model_states.pt. +[default7]:[2022-10-07 09:01:53,776] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default7]:[2022-10-07 09:01:53,793] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default7]:[2022-10-07 09:01:53,794] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_01-model_states.pt... +[default7]:[2022-10-07 09:01:53,810] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_01-model_states.pt. +[default7]:[2022-10-07 09:01:53,814] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default7]:[2022-10-07 09:01:53,830] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default7]:[2022-10-07 09:01:53,830] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_01-model_states.pt... +[default7]:[2022-10-07 09:01:53,849] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_01-model_states.pt. +[default4]:[2022-10-07 09:01:53,753] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:53,757] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default4]:[2022-10-07 09:01:53,776] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:53,776] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default4]:[2022-10-07 09:01:53,792] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:53,796] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default4]:[2022-10-07 09:01:53,812] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:53,812] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default4]:[2022-10-07 09:01:53,828] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:53,833] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default4]:[2022-10-07 09:01:53,850] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:53,850] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default4]:[2022-10-07 09:01:53,768] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:53,773] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default4]:[2022-10-07 09:01:53,788] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:53,788] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default4]:[2022-10-07 09:01:53,803] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:53,806] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default4]:[2022-10-07 09:01:53,825] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:53,826] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default4]:[2022-10-07 09:01:53,843] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default7]:[2022-10-07 09:01:53,771] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_01-model_states.pt. +[default7]:[2022-10-07 09:01:53,775] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default7]:[2022-10-07 09:01:53,791] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default7]:[2022-10-07 09:01:53,791] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_01-model_states.pt... +[default7]:[2022-10-07 09:01:53,807] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_01-model_states.pt. +[default7]:[2022-10-07 09:01:53,811] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default7]:[2022-10-07 09:01:53,831] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default7]:[2022-10-07 09:01:53,832] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_01-model_states.pt... +[default7]:[2022-10-07 09:01:53,850] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_01-model_states.pt. +[default6]:[2022-10-07 09:01:53,767] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:53,772] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default6]:[2022-10-07 09:01:53,800] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:53,800] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default6]:[2022-10-07 09:01:53,826] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:53,830] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default6]:[2022-10-07 09:01:53,856] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:53,856] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default4]:[2022-10-07 09:01:53,778] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:53,778] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default4]:[2022-10-07 09:01:53,794] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:53,798] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default4]:[2022-10-07 09:01:53,817] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:53,817] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default4]:[2022-10-07 09:01:53,832] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:53,837] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default4]:[2022-10-07 09:01:53,852] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:53,852] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default4]:[2022-10-07 09:01:53,838] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:53,838] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default4]:[2022-10-07 09:01:53,864] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:53,870] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default4]:[2022-10-07 09:01:53,889] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:53,889] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default4]:[2022-10-07 09:01:53,915] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:53,921] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default6]:[2022-10-07 09:01:53,851] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:53,851] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default6]:[2022-10-07 09:01:53,874] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:53,878] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default6]:[2022-10-07 09:01:53,879] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:53,879] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default6]:[2022-10-07 09:01:53,879] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:53,932] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_02_optim_states.pt'] +[default6]:[2022-10-07 09:01:53,932] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_02_optim_states.pt'] +[default7]:[2022-10-07 09:01:53,858] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default7]:[2022-10-07 09:01:53,859] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_01-model_states.pt... +[default7]:[2022-10-07 09:01:53,879] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_01-model_states.pt. +[default7]:[2022-10-07 09:01:53,887] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default7]:[2022-10-07 09:01:53,887] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default7]:[2022-10-07 09:01:53,887] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_01-model_states.pt... +[default7]:[2022-10-07 09:01:53,887] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_01-model_states.pt. +[default5]:[2022-10-07 09:01:53,840] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default5]:[2022-10-07 09:01:53,864] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default5]:[2022-10-07 09:01:53,865] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_01-model_states.pt... +[default5]:[2022-10-07 09:01:53,883] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_01-model_states.pt. +[default5]:[2022-10-07 09:01:53,889] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default5]:[2022-10-07 09:01:53,910] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default5]:[2022-10-07 09:01:53,910] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_01-model_states.pt... +[default5]:[2022-10-07 09:01:53,927] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_01-model_states.pt. +[default5]:[2022-10-07 09:01:53,932] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default6]:[2022-10-07 09:01:53,932] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_00_optim_states.pt'] +[default6]:[2022-10-07 09:01:53,932] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt'] +[default7]:[2022-10-07 09:01:53,853] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default5]:[2022-10-07 09:01:53,854] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_01-model_states.pt. +[default5]:[2022-10-07 09:01:53,859] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default7]:[2022-10-07 09:01:53,869] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default7]:[2022-10-07 09:01:53,870] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_01-model_states.pt... +[default7]:[2022-10-07 09:01:53,890] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_01-model_states.pt. +[default5]:[2022-10-07 09:01:53,875] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default5]:[2022-10-07 09:01:53,876] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_01-model_states.pt... +[default7]:[2022-10-07 09:01:53,895] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default7]:[2022-10-07 09:01:53,912] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default7]:[2022-10-07 09:01:53,912] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_01-model_states.pt... +[default7]:[2022-10-07 09:01:53,928] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_01-model_states.pt. +[default5]:[2022-10-07 09:01:53,892] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_01-model_states.pt. +[default7]:[2022-10-07 09:01:53,932] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default7]:[2022-10-07 09:01:53,948] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default5]:[2022-10-07 09:01:53,896] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default5]:[2022-10-07 09:01:53,914] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default5]:[2022-10-07 09:01:53,914] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_01-model_states.pt... +[default5]:[2022-10-07 09:01:53,933] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_01-model_states.pt. +[default5]:[2022-10-07 09:01:53,937] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default7]:[2022-10-07 09:01:53,948] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_01-model_states.pt... +[default4]:[2022-10-07 09:01:53,893] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_02_optim_states.pt'] +[default4]:[2022-10-07 09:01:53,893] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_02_optim_states.pt'] +[default4]:[2022-10-07 09:01:53,865] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:53,869] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default4]:[2022-10-07 09:01:53,886] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:53,887] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default4]:[2022-10-07 09:01:53,904] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:53,892] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_00_optim_states.pt'] +[default4]:[2022-10-07 09:01:53,893] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt'] +[default7]:[2022-10-07 09:01:53,947] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_01_optim_states.pt'] +[default7]:[2022-10-07 09:01:53,947] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt'] +[default6]:[2022-10-07 09:01:53,870] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:53,874] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default6]:[2022-10-07 09:01:53,890] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:53,890] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default6]:[2022-10-07 09:01:53,904] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:53,908] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default6]:[2022-10-07 09:01:53,933] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:53,933] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default6]:[2022-10-07 09:01:53,947] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:53,951] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default4]:[2022-10-07 09:01:53,869] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:53,874] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default4]:[2022-10-07 09:01:53,896] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:53,897] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default4]:[2022-10-07 09:01:53,912] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:53,916] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default4]:[2022-10-07 09:01:53,941] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:53,941] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default4]:[2022-10-07 09:01:53,956] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:53,961] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default4]:[2022-10-07 09:01:53,947] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:53,947] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default4]:[2022-10-07 09:01:53,966] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:53,972] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default4]:[2022-10-07 09:01:53,995] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:53,995] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default4]:[2022-10-07 09:01:54,014] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:54,020] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default7]:[2022-10-07 09:01:53,948] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_03_optim_states.pt'] +[default7]:[2022-10-07 09:01:53,948] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_03_optim_states.pt'] +[default5]:[2022-10-07 09:01:53,951] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default5]:[2022-10-07 09:01:53,951] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_01-model_states.pt... +[default5]:[2022-10-07 09:01:53,966] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_01-model_states.pt. +[default5]:[2022-10-07 09:01:53,972] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default5]:[2022-10-07 09:01:53,990] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default5]:[2022-10-07 09:01:53,991] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_01-model_states.pt... +[default5]:[2022-10-07 09:01:54,007] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_01-model_states.pt. +[default5]:[2022-10-07 09:01:54,013] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default5]:[2022-10-07 09:01:54,013] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default5]:[2022-10-07 09:01:54,013] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_01-model_states.pt... +[default5]:[2022-10-07 09:01:54,013] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_01-model_states.pt. +[default6]:[2022-10-07 09:01:54,042] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_02_optim_states.pt'] +[default6]:[2022-10-07 09:01:54,042] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_02_optim_states.pt'] +[default7]:[2022-10-07 09:01:53,963] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_01-model_states.pt. +[default7]:[2022-10-07 09:01:53,967] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default7]:[2022-10-07 09:01:53,983] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default7]:[2022-10-07 09:01:53,983] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_01-model_states.pt... +[default7]:[2022-10-07 09:01:54,004] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_01-model_states.pt. +[default7]:[2022-10-07 09:01:54,008] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default7]:[2022-10-07 09:01:54,026] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default7]:[2022-10-07 09:01:54,027] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_01-model_states.pt... +[default7]:[2022-10-07 09:01:54,048] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_01-model_states.pt. +[default5]:[2022-10-07 09:01:53,953] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default5]:[2022-10-07 09:01:53,954] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_01-model_states.pt... +[default5]:[2022-10-07 09:01:53,970] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_01-model_states.pt. +[default5]:[2022-10-07 09:01:53,974] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default5]:[2022-10-07 09:01:53,990] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default5]:[2022-10-07 09:01:53,990] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_01-model_states.pt... +[default5]:[2022-10-07 09:01:54,008] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_01-model_states.pt. +[default5]:[2022-10-07 09:01:54,013] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default5]:[2022-10-07 09:01:54,032] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default5]:[2022-10-07 09:01:54,032] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_01-model_states.pt... +[default6]:[2022-10-07 09:01:53,978] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:53,978] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default6]:[2022-10-07 09:01:53,994] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default6]:[2022-10-07 09:01:54,042] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_00_optim_states.pt'] +[default6]:[2022-10-07 09:01:54,042] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt'] +[default4]:[2022-10-07 09:01:53,978] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:53,978] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default4]:[2022-10-07 09:01:53,994] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:53,998] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default4]:[2022-10-07 09:01:54,016] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:54,016] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default4]:[2022-10-07 09:01:54,031] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:54,035] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default4]:[2022-10-07 09:01:54,058] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:54,058] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default4]:[2022-10-07 09:01:54,045] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:54,045] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default4]:[2022-10-07 09:01:54,068] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:54,073] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default4]:[2022-10-07 09:01:54,090] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:54,090] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default4]:[2022-10-07 09:01:54,108] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:54,112] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default4]:[2022-10-07 09:01:54,133] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:54,133] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default5]:[2022-10-07 09:01:54,064] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_03_optim_states.pt'] +[default5]:[2022-10-07 09:01:54,064] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_03_optim_states.pt'] +[default4]:[2022-10-07 09:01:54,127] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_02_optim_states.pt'] +[default4]:[2022-10-07 09:01:54,127] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_02_optim_states.pt'] +[default7]:[2022-10-07 09:01:54,052] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default7]:[2022-10-07 09:01:54,071] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default7]:[2022-10-07 09:01:54,071] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_01-model_states.pt... +[default7]:[2022-10-07 09:01:54,088] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_01-model_states.pt. +[default7]:[2022-10-07 09:01:54,092] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default7]:[2022-10-07 09:01:54,107] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default5]:[2022-10-07 09:01:54,052] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_01-model_states.pt. +[default5]:[2022-10-07 09:01:54,056] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default5]:[2022-10-07 09:01:54,064] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/globa[default5]:[2022-10-07 09:01:54,074] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default5]:[2022-10-07 09:01:54,074] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_01-model_states.pt... +[default7]:[2022-10-07 09:01:54,107] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_01-model_states.pt... +l_step0/zero_pp_rank_5_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_01_optim_states.pt'] +[default5]:[2022-10-07 09:01:54,064] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmi[default7]:[2022-10-07 09:01:54,127] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_01-model_states.pt. +[default5]:[2022-10-07 09:01:54,094] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_01-model_states.pt. +xnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_0[default5]:[2022-10-07 09:01:54,099] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +1_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt'] +[default7]:[2022-10-07 09:01:54,131] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default5]:[2022-10-07 09:01:54,118] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default5]:[2022-10-07 09:01:54,119] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_01-model_states.pt... +[default5]:[2022-10-07 09:01:54,138] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_01-model_states.pt. +[default5]:[2022-10-07 09:01:54,143] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default4]:[2022-10-07 09:01:54,078] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:54,126] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_00_optim_states.pt'] +[default4]:[2022-10-07 09:01:54,127] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt'] +[default4]:[2022-10-07 09:01:54,152] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:54,157] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default4]:[2022-10-07 09:01:54,157] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:54,157] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default4]:[2022-10-07 09:01:54,157] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default4]:[2022-10-07 09:01:54,202] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_02_optim_states.pt'] +[default4]:[2022-10-07 09:01:54,203] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_02_optim_states.pt'] +[default7]:[2022-10-07 09:01:54,217] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_03_optim_states.pt'] +[default7]:[2022-10-07 09:01:54,218] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_03_optim_states.pt'] +[default5]:[2022-10-07 09:01:54,234] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_03_optim_states.pt'] +[default5]:[2022-10-07 09:01:54,235] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_03_optim_states.pt'] +[default7]:[2022-10-07 09:01:54,151] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default7]:[2022-10-07 09:01:54,152] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_01-model_states.pt... +[default5]:[2022-10-07 09:01:54,164] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default5]:[2022-10-07 09:01:54,164] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_01-model_states.pt... +[default5]:[2022-10-07 09:01:54,183] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_01-model_states.pt. +[default7]:[2022-10-07 09:01:54,168] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_01-model_states.pt. +[default7]:[2022-10-07 09:01:54,217] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_01_optim_states.pt'] +[default7]:[2022-10-07 09:01:54,217] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt'] +[default5]:[2022-10-07 09:01:54,234] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_01_optim_states.pt'] +[default5]:[2022-10-07 09:01:54,234] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt'] +[default4]:[2022-10-07 09:01:54,202] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_00_optim_states.pt'] +[default4]:[2022-10-07 09:01:54,202] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt'] +[default0]: successfully loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq at iteration 0 +[default7]:time (ms) | load-checkpoint: 7118.67 +[default0]:estimated model parameters: 2.236514304 +[default0]:estimated model parameters without embeddings: 1.208909824 +[default0]:[after model, optimizer, and learning rate scheduler are built] datetime: 2022-10-07 09:01:54 +[default0]:> building train, validation, and test datasets ... +[default0]: > datasets target sizes (minimum size): +[default0]: train: 6348800 +[default0]: validation: 133120 +[default0]: test: 10240 +[default0]:> building train, validation, and test datasets for T0 ... +[default0]: > building dataset index ... +[default0]:/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/utils.py:365: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings +[default0]: warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.294827 seconds +[default0]: number of documents: 31495184 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 29920425) total of 29920425 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.313728 seconds +[default0]: number of documents: 31495184 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.002656 seconds +[default0]: number of documents: 31495184 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_en_train_indexmap_2470317ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_en_train_indexmap_2470317ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.132 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.049505 seconds +[default0]: number of documents: 5151349 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 4893782) total of 4893782 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.149351 seconds +[default0]: number of documents: 5151349 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.002295 seconds +[default0]: number of documents: 5151349 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_es_train_indexmap_501992ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_es_train_indexmap_501992ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.123 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.104822 seconds +[default0]: number of documents: 3562772 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 3384633) total of 3384633 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.044062 seconds +[default0]: number of documents: 3562772 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003659 seconds +[default0]: number of documents: 3562772 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_pt_train_indexmap_406376ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_pt_train_indexmap_406376ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.092 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.085677 seconds +[default0]: number of documents: 2707724 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 2572338) total of 2572338 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.124638 seconds +[default0]: number of documents: 2707724 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003422 seconds +[default0]: number of documents: 2707724 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_code_train_indexmap_373254ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_code_train_indexmap_373254ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.107 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.129537 seconds +[default0]: number of documents: 5055942 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 4803145) total of 4803145 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.038645 seconds +[default0]: number of documents: 5055942 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.002897 seconds +[default0]: number of documents: 5055942 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_fr_train_indexmap_367732ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_fr_train_indexmap_367732ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.093 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.106836 seconds +[default0]: number of documents: 2148955 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 2041507) total of 2041507 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.051933 seconds +[default0]: number of documents: 2148955 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003834 seconds +[default0]: number of documents: 2148955 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ar_train_indexmap_310091ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ar_train_indexmap_310091ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.086 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.055529 seconds +[default0]: number of documents: 2627392 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 2496022) total of 2496022 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.067639 seconds +[default0]: number of documents: 2627392 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003186 seconds +[default0]: number of documents: 2627392 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_id_train_indexmap_305386ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_id_train_indexmap_305386ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.091 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.071100 seconds +[default0]: number of documents: 3560556 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 3382528) total of 3382528 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.136047 seconds +[default0]: number of documents: 3560556 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003400 seconds +[default0]: number of documents: 3560556 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_zh_train_indexmap_304299ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_zh_train_indexmap_304299ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.095 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.061854 seconds +[default0]: number of documents: 1543441 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 1466269) total of 1466269 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.063912 seconds +[default0]: number of documents: 1543441 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003856 seconds +[default0]: number of documents: 1543441 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_hi_train_indexmap_291291ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_hi_train_indexmap_291291ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.090 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.074141 seconds +[default0]: number of documents: 1667306 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 1583941) total of 1583941 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.063017 seconds +[default0]: number of documents: 1667306 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003589 seconds +[default0]: number of documents: 1667306 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_vi_train_indexmap_205616ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_vi_train_indexmap_205616ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.079 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.071542 seconds +[default0]: number of documents: 855756 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 812968) total of 812968 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.061807 seconds +[default0]: number of documents: 855756 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.002970 seconds +[default0]: number of documents: 855756 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ur_train_indexmap_127177ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ur_train_indexmap_127177ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.068 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.064085 seconds +[default0]: number of documents: 573364 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 544696) total of 544696 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.049706 seconds +[default0]: number of documents: 573364 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.002191 seconds +[default0]: number of documents: 573364 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_te_train_indexmap_88369ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_te_train_indexmap_88369ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.065 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.039725 seconds +[default0]: number of documents: 410633 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 390101) total of 390101 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.040539 seconds +[default0]: number of documents: 410633 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001203 seconds +[default0]: number of documents: 410633 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ta_train_indexmap_61292ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ta_train_indexmap_61292ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.056 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.050339 seconds +[default0]: number of documents: 428843 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 407401) total of 407401 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.048998 seconds +[default0]: number of documents: 428843 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001718 seconds +[default0]: number of documents: 428843 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_bn_train_indexmap_55208ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_bn_train_indexmap_55208ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.072 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.041840 seconds +[default0]: number of documents: 417269 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 396406) total of 396406 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.051083 seconds +[default0]: number of documents: 417269 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.000999 seconds +[default0]: number of documents: 417269 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_mr_train_indexmap_44171ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_mr_train_indexmap_44171ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.067 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.070101 seconds +[default0]: number of documents: 1114455 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 1058732) total of 1058732 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.056839 seconds +[default0]: number of documents: 1114455 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003785 seconds +[default0]: number of documents: 1114455 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_sw_train_indexmap_37186ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_sw_train_indexmap_37186ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.071 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.040487 seconds +[default0]: number of documents: 347499 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 330124) total of 330124 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.039893 seconds +[default0]: number of documents: 347499 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001104 seconds +[default0]: number of documents: 347499 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_gu_train_indexmap_37173ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_gu_train_indexmap_37173ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.072 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.035856 seconds +[default0]: number of documents: 339210 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 322250) total of 322250 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.046204 seconds +[default0]: number of documents: 339210 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001212 seconds +[default0]: number of documents: 339210 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_pa_train_indexmap_34691ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_pa_train_indexmap_34691ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.056 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.041873 seconds +[default0]: number of documents: 315754 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 299966) total of 299966 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.040462 seconds +[default0]: number of documents: 315754 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001243 seconds +[default0]: number of documents: 315754 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ne_train_indexmap_26100ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ne_train_indexmap_26100ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.039 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.046476 seconds +[default0]: number of documents: 918416 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 872495) total of 872495 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.046127 seconds +[default0]: number of documents: 918416 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003567 seconds +[default0]: number of documents: 918416 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_yo_train_indexmap_23389ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_yo_train_indexmap_23389ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.052 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.056661 seconds +[default0]: number of documents: 950097 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 902592) total of 902592 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.048424 seconds +[default0]: number of documents: 950097 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003064 seconds +[default0]: number of documents: 950097 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ig_train_indexmap_21563ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ig_train_indexmap_21563ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.063 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.051609 seconds +[default0]: number of documents: 915063 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 869310) total of 869310 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.053678 seconds +[default0]: number of documents: 915063 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003240 seconds +[default0]: number of documents: 915063 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ny_train_indexmap_18042ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ny_train_indexmap_18042ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.056 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.053201 seconds +[default0]: number of documents: 915061 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 869308) total of 869308 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.050471 seconds +[default0]: number of documents: 915061 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003102 seconds +[default0]: number of documents: 915061 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_zu_train_indexmap_17484ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_zu_train_indexmap_17484ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.069 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.048811 seconds +[default0]: number of documents: 915058 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 869305) total of 869305 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.053272 seconds +[default0]: number of documents: 915058 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003640 seconds +[default0]: number of documents: 915058 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_xh_train_indexmap_16885ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_xh_train_indexmap_16885ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.056 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.054881 seconds +[default0]: number of documents: 865056 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 821803) total of 821803 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.049015 seconds +[default0]: number of documents: 865056 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.004083 seconds +[default0]: number of documents: 865056 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_sn_train_indexmap_16740ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_sn_train_indexmap_16740ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.062 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.052749 seconds +[default0]: number of documents: 915044 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 869292) total of 869292 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.051374 seconds +[default0]: number of documents: 915044 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003577 seconds +[default0]: number of documents: 915044 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ts_train_indexmap_16592ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ts_train_indexmap_16592ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.065 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.048989 seconds +[default0]: number of documents: 915043 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 869291) total of 869291 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.048458 seconds +[default0]: number of documents: 915043 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.002571 seconds +[default0]: number of documents: 915043 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_rw_train_indexmap_16532ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_rw_train_indexmap_16532ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.052 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.054973 seconds +[default0]: number of documents: 915021 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 869270) total of 869270 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.050302 seconds +[default0]: number of documents: 915021 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003075 seconds +[default0]: number of documents: 915021 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_lg_train_indexmap_15642ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_lg_train_indexmap_15642ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.058 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.053808 seconds +[default0]: number of documents: 915054 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 869301) total of 869301 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.048779 seconds +[default0]: number of documents: 915054 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.004036 seconds +[default0]: number of documents: 915054 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_tn_train_indexmap_15616ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_tn_train_indexmap_15616ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.074 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.043280 seconds +[default0]: number of documents: 915051 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 869298) total of 869298 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.054205 seconds +[default0]: number of documents: 915051 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003562 seconds +[default0]: number of documents: 915051 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_nso_train_indexmap_15230ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_nso_train_indexmap_15230ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.048 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.033974 seconds +[default0]: number of documents: 318189 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 302280) total of 302280 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.041670 seconds +[default0]: number of documents: 318189 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001160 seconds +[default0]: number of documents: 318189 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_rn_train_indexmap_12795ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_rn_train_indexmap_12795ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.052 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.037514 seconds +[default0]: number of documents: 265864 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 252571) total of 252571 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.032823 seconds +[default0]: number of documents: 265864 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001127 seconds +[default0]: number of documents: 265864 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ml_train_indexmap_11605ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ml_train_indexmap_11605ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.049 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.045245 seconds +[default0]: number of documents: 265063 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 251810) total of 251810 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.040933 seconds +[default0]: number of documents: 265063 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.000953 seconds +[default0]: number of documents: 265063 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_kn_train_indexmap_10970ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_kn_train_indexmap_10970ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.039 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.037636 seconds +[default0]: number of documents: 265063 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 251810) total of 251810 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.033579 seconds +[default0]: number of documents: 265063 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.000947 seconds +[default0]: number of documents: 265063 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_or_train_indexmap_10706ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_or_train_indexmap_10706ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.032 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.084210 seconds +[default0]: number of documents: 265063 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 251810) total of 251810 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.073886 seconds +[default0]: number of documents: 265063 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001229 seconds +[default0]: number of documents: 265063 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_as_train_indexmap_10360ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_as_train_indexmap_10360ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.044 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.038453 seconds +[default0]: number of documents: 365060 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 346807) total of 346807 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.039231 seconds +[default0]: number of documents: 365060 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001345 seconds +[default0]: number of documents: 365060 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ln_train_indexmap_8374ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ln_train_indexmap_8374ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.028 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.039160 seconds +[default0]: number of documents: 365063 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 346810) total of 346810 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.038053 seconds +[default0]: number of documents: 365063 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001119 seconds +[default0]: number of documents: 365063 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_wo_train_indexmap_8126ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_wo_train_indexmap_8126ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.037 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.035925 seconds +[default0]: number of documents: 265063 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 251810) total of 251810 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.036919 seconds +[default0]: number of documents: 265063 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.000723 seconds +[default0]: number of documents: 265063 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_tum_train_indexmap_7693ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_tum_train_indexmap_7693ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.028 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.061288 seconds +[default0]: number of documents: 265180 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 251921) total of 251921 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.042788 seconds +[default0]: number of documents: 265180 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.000997 seconds +[default0]: number of documents: 265180 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ki_train_indexmap_7627ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ki_train_indexmap_7627ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.024 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.038555 seconds +[default0]: number of documents: 265063 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 251810) total of 251810 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.033638 seconds +[default0]: number of documents: 265063 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.000858 seconds +[default0]: number of documents: 265063 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_st_train_indexmap_7564ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_st_train_indexmap_7564ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.034 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.038422 seconds +[default0]: number of documents: 265063 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 251810) total of 251810 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.033604 seconds +[default0]: number of documents: 265063 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.000776 seconds +[default0]: number of documents: 265063 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_fon_train_indexmap_7497ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_fon_train_indexmap_7497ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.030 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.033125 seconds +[default0]: number of documents: 271191 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 257631) total of 257631 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.035129 seconds +[default0]: number of documents: 271191 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.000928 seconds +[default0]: number of documents: 271191 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ca_train_indexmap_7334ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ca_train_indexmap_7334ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.044 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.034280 seconds +[default0]: number of documents: 269973 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 256474) total of 256474 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.034900 seconds +[default0]: number of documents: 269973 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.000822 seconds +[default0]: number of documents: 269973 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_eu_train_indexmap_7168ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_eu_train_indexmap_7168ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.029 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.047840 seconds +[default0]: number of documents: 265071 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 251817) total of 251817 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.038019 seconds +[default0]: number of documents: 265071 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.000941 seconds +[default0]: number of documents: 265071 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ak_train_indexmap_7167ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ak_train_indexmap_7167ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.026 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.039834 seconds +[default0]: number of documents: 265180 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 251921) total of 251921 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.032304 seconds +[default0]: number of documents: 265180 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.000813 seconds +[default0]: number of documents: 265180 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_bm_train_indexmap_7098ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_bm_train_indexmap_7098ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.043 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.033005 seconds +[default0]: number of documents: 265071 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 251817) total of 251817 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.043083 seconds +[default0]: number of documents: 265071 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.000828 seconds +[default0]: number of documents: 265071 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_tw_train_indexmap_7047ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_tw_train_indexmap_7047ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.033 seconds +[default0]:> building indices for blendable datasets ... +[default0]: > sample ratios: +[default0]: dataset 0, input: 0.387164, achieved: 0.387164 +[default0]: dataset 1, input: 0.0786754, achieved: 0.0786753 +[default0]: dataset 2, input: 0.0636898, achieved: 0.0636898 +[default0]: dataset 3, input: 0.0584986, achieved: 0.0584986 +[default0]: dataset 4, input: 0.0576332, achieved: 0.0576332 +[default0]: dataset 5, input: 0.0485994, achieved: 0.0485994 +[default0]: dataset 6, input: 0.0478619, achieved: 0.0478619 +[default0]: dataset 7, input: 0.0476917, achieved: 0.0476917 +[default0]: dataset 8, input: 0.045653, achieved: 0.045653 +[default0]: dataset 9, input: 0.0322254, achieved: 0.0322254 +[default0]: dataset 10, input: 0.0199319, achieved: 0.0199319 +[default0]: dataset 11, input: 0.0138497, achieved: 0.0138497 +[default0]: dataset 12, input: 0.00960604, achieved: 0.00960602 +[default0]: dataset 13, input: 0.00865243, achieved: 0.00865244 +[default0]: dataset 14, input: 0.00692261, achieved: 0.00692258 +[default0]: dataset 15, input: 0.00582803, achieved: 0.00582806 +[default0]: dataset 16, input: 0.00582586, achieved: 0.00582586 +[default0]: dataset 17, input: 0.00543684, achieved: 0.00543682 +[default0]: dataset 18, input: 0.00409056, achieved: 0.00409057 +[default0]: dataset 19, input: 0.00366562, achieved: 0.00366564 +[default0]: dataset 20, input: 0.00337934, achieved: 0.00337937 +[default0]: dataset 21, input: 0.00282756, achieved: 0.00282753 +[default0]: dataset 22, input: 0.00274012, achieved: 0.00274012 +[default0]: dataset 23, input: 0.00264619, achieved: 0.00264622 +[default0]: dataset 24, input: 0.00262358, achieved: 0.0026236 +[default0]: dataset 25, input: 0.0026003, achieved: 0.00260032 +[default0]: dataset 26, input: 0.00259099, achieved: 0.00259097 +[default0]: dataset 27, input: 0.00245151, achieved: 0.00245155 +[default0]: dataset 28, input: 0.00244735, achieved: 0.00244736 +[default0]: dataset 29, input: 0.00238684, achieved: 0.00238686 +[default0]: dataset 30, input: 0.0020053, achieved: 0.00200525 +[default0]: dataset 31, input: 0.00181876, achieved: 0.00181879 +[default0]: dataset 32, input: 0.00171918, achieved: 0.00171917 +[default0]: dataset 33, input: 0.00167779, achieved: 0.00167776 +[default0]: dataset 34, input: 0.00162359, achieved: 0.00162355 +[default0]: dataset 35, input: 0.00131237, achieved: 0.00131238 +[default0]: dataset 36, input: 0.00127347, achieved: 0.00127344 +[default0]: dataset 37, input: 0.00120564, achieved: 0.00120569 +[default0]: dataset 38, input: 0.00119533, achieved: 0.00119529 +[default0]: dataset 39, input: 0.00118536, achieved: 0.00118536 +[default0]: dataset 40, input: 0.00117488, achieved: 0.00117487 +[default0]: dataset 41, input: 0.00114928, achieved: 0.00114929 +[default0]: dataset 42, input: 0.00112334, achieved: 0.00112334 +[default0]: dataset 43, input: 0.00112318, achieved: 0.00112315 +[default0]: dataset 44, input: 0.00111237, achieved: 0.00111236 +[default0]: dataset 45, input: 0.00110439, achieved: 0.00110444 +[default0]:> elapsed time for building blendable dataset indices: 0.52 (sec) +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.231408 seconds +[default0]: number of documents: 15234080 +[default0]: > dataset split: +[default0]: validation_pretraining: +[default0]: document indices in [14472376, 15234080) total of 761704 documents +[default0]: > loading doc-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/ar/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_ar_text_document_validation_pretraining_indexmap_4424ns_2048sl_42s_doc_idx.npy +[default0]: > loading sample-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/ar/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_ar_text_document_validation_pretraining_indexmap_4424ns_2048sl_42s_sample_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/ar/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_ar_text_document_validation_pretraining_indexmap_4424ns_2048sl_42s_shuffle_idx.npy +[default0]: loaded indexed file in 0.078 seconds +[default0]: total number of samples: 221750 +[default0]: total number of epochs: 1 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.173414 seconds +[default0]: number of documents: 6142390 +[default0]: > dataset split: +[default0]: validation_pretraining: +[default0]: document indices in [5835270, 6142390) total of 307120 documents +[default0]: > loading doc-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/ca/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_ca_text_document_validation_pretraining_indexmap_1505ns_2048sl_42s_doc_idx.npy +[default0]: > loading sample-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/ca/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_ca_text_document_validation_pretraining_indexmap_1505ns_2048sl_42s_sample_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/ca/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_ca_text_document_validation_pretraining_indexmap_1505ns_2048sl_42s_shuffle_idx.npy +[default0]: loaded indexed file in 0.064 seconds +[default0]: total number of samples: 136143 +[default0]: total number of epochs: 1 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.114803 seconds +[default0]: number of documents: 26176998 +[default0]: > dataset split: +[default0]: validation_pretraining: +[default0]: document indices in [24868148, 26176998) total of 1308850 documents +[default0]: > loading doc-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/code/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_code_text_document_validation_pretraining_indexmap_17429ns_2048sl_42s_doc_idx.npy +[default0]: > loading sample-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/code/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_code_text_document_validation_pretraining_indexmap_17429ns_2048sl_42s_sample_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/code/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_code_text_document_validation_pretraining_indexmap_17429ns_2048sl_42s_shuffle_idx.npy +[default0]: loaded indexed file in 0.103 seconds +[default0]: total number of samples: 432311 +[default0]: total number of epochs: 1 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.101528 seconds +[default0]: number of documents: 20844665 +[default0]: > dataset split: +[default0]: validation_pretraining: +[default0]: document indices in [19802432, 20844665) total of 1042233 documents +[default0]: > loading doc-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/en/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_en_text_document_validation_pretraining_indexmap_29662ns_2048sl_42s_doc_idx.npy +[default0]: > loading sample-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/en/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_en_text_document_validation_pretraining_indexmap_29662ns_2048sl_42s_sample_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/en/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_en_text_document_validation_pretraining_indexmap_29662ns_2048sl_42s_shuffle_idx.npy +[default0]: loaded indexed file in 0.103 seconds +[default0]: total number of samples: 521545 +[default0]: total number of epochs: 1 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.160002 seconds +[default0]: number of documents: 67005817 +[default0]: > dataset split: +[default0]: validation_pretraining: +[default0]: document indices in [63655526, 67005817) total of 3350291 documents +[default0]: > loading doc-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/es/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_es_text_document_validation_pretraining_indexmap_14273ns_2048sl_42s_doc_idx.npy +[default0]: > loading sample-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/es/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_es_text_document_validation_pretraining_indexmap_14273ns_2048sl_42s_sample_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/es/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_es_text_document_validation_pretraining_indexmap_14273ns_2048sl_42s_shuffle_idx.npy +[default0]: loaded indexed file in 0.115 seconds +[default0]: total number of samples: 1740321 +[default0]: total number of epochs: 1 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.142110 seconds +[default0]: number of documents: 5149795 +[default0]: > dataset split: +[default0]: validation_pretraining: +[default0]: document indices in [4892305, 5149795) total of 257490 documents +[default0]: > loading doc-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/eu/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_eu_text_document_validation_pretraining_indexmap_209ns_2048sl_42s_doc_idx.npy +[default0]: > loading sample-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/eu/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_eu_text_document_validation_pretraining_indexmap_209ns_2048sl_42s_sample_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/eu/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_eu_text_document_validation_pretraining_indexmap_209ns_2048sl_42s_shuffle_idx.npy +[default0]: loaded indexed file in 0.045 seconds +[default0]: total number of samples: 26370 +[default0]: total number of epochs: 1 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.108135 seconds +[default0]: number of documents: 58847091 +[default0]: > dataset split: +[default0]: validation_pretraining: +[default0]: document indices in [55904736, 58847091) total of 2942355 documents +[default0]: > loading doc-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/fr/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_fr_text_document_validation_pretraining_indexmap_17465ns_2048sl_42s_doc_idx.npy +[default0]: > loading sample-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/fr/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_fr_text_document_validation_pretraining_indexmap_17465ns_2048sl_42s_sample_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/fr/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_fr_text_document_validation_pretraining_indexmap_17465ns_2048sl_42s_shuffle_idx.npy +[default0]: loaded indexed file in 0.115 seconds +[default0]: total number of samples: 1458654 +[default0]: total number of epochs: 1 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.147771 seconds +[default0]: number of documents: 12514253 +[default0]: > dataset split: +[default0]: validation_pretraining: +[default0]: document indices in [11888540, 12514253) total of 625713 documents +[default0]: > loading doc-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/id/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_id_text_document_validation_pretraining_indexmap_1461ns_2048sl_42s_doc_idx.npy +[default0]: > loading sample-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/id/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_id_text_document_validation_pretraining_indexmap_1461ns_2048sl_42s_sample_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/id/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_id_text_document_validation_pretraining_indexmap_1461ns_2048sl_42s_shuffle_idx.npy +[default0]: loaded indexed file in 0.063 seconds +[default0]: total number of samples: 134071 +[default0]: total number of epochs: 1 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.060754 seconds +[default0]: number of documents: 180608 +[default0]: > dataset split: +[default0]: validation_pretraining: +[default0]: document indices in [171578, 180608) total of 9030 documents +[default0]: > loading doc-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-as/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-as_text_document_validation_pretraining_indexmap_15ns_2048sl_42s_doc_idx.npy +[default0]: > loading sample-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-as/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-as_text_document_validation_pretraining_indexmap_15ns_2048sl_42s_sample_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-as/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-as_text_document_validation_pretraining_indexmap_15ns_2048sl_42s_shuffle_idx.npy +[default0]: loaded indexed file in 0.003 seconds +[default0]: total number of samples: 2501 +[default0]: total number of epochs: 1 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.122916 seconds +[default0]: number of documents: 12303134 +[default0]: > dataset split: +[default0]: validation_pretraining: +[default0]: document indices in [11687977, 12303134) total of 615157 documents +[default0]: > loading doc-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-bn/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-bn_text_document_validation_pretraining_indexmap_735ns_2048sl_42s_doc_idx.npy +[default0]: > loading sample-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-bn/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-bn_text_document_validation_pretraining_indexmap_735ns_2048sl_42s_sample_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-bn/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-bn_text_document_validation_pretraining_indexmap_735ns_2048sl_42s_shuffle_idx.npy +[default0]: loaded indexed file in 0.071 seconds +[default0]: total number of samples: 157244 +[default0]: total number of epochs: 1 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.102819 seconds +[default0]: number of documents: 2033057 +[default0]: > dataset split: +[default0]: validation_pretraining: +[default0]: document indices in [1931404, 2033057) total of 101653 documents +[default0]: > loading doc-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-gu/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-gu_text_document_validation_pretraining_indexmap_54ns_2048sl_42s_doc_idx.npy +[default0]: > loading sample-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-gu/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-gu_text_document_validation_pretraining_indexmap_54ns_2048sl_42s_sample_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-gu/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-gu_text_document_validation_pretraining_indexmap_54ns_2048sl_42s_shuffle_idx.npy +[default0]: loaded indexed file in 0.005 seconds +[default0]: total number of samples: 20517 +[default0]: total number of epochs: 1 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.098393 seconds +[default0]: number of documents: 26793553 +[default0]: > dataset split: +[default0]: validation_pretraining: +[default0]: document indices in [25453875, 26793553) total of 1339678 documents +[default0]: > loading doc-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-hi/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-hi_text_document_validation_pretraining_indexmap_1000ns_2048sl_42s_doc_idx.npy +[default0]: > loading sample-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-hi/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-hi_text_document_validation_pretraining_indexmap_1000ns_2048sl_42s_sample_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-hi/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-hi_text_document_validation_pretraining_indexmap_1000ns_2048sl_42s_shuffle_idx.npy +[default0]: loaded indexed file in 0.076 seconds +[default0]: total number of samples: 101502 +[default0]: total number of epochs: 1 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.141111 seconds +[default0]: number of documents: 3155990 +[default0]: > dataset split: +[default0]: validation_pretraining: +[default0]: document indices in [2998190, 3155990) total of 157800 documents +[default0]: > loading doc-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-kn/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-kn_text_document_validation_pretraining_indexmap_83ns_2048sl_42s_doc_idx.npy +[default0]: > loading sample-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-kn/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-kn_text_document_validation_pretraining_indexmap_83ns_2048sl_42s_sample_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-kn/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-kn_text_document_validation_pretraining_indexmap_83ns_2048sl_42s_shuffle_idx.npy +[default0]: loaded indexed file in 0.043 seconds +[default0]: total number of samples: 44182 +[default0]: total number of epochs: 1 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.080838 seconds +[default0]: number of documents: 6692522 +[default0]: > dataset split: +[default0]: validation_pretraining: +[default0]: document indices in [6357896, 6692522) total of 334626 documents +[default0]: > loading doc-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-ml/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-ml_text_document_validation_pretraining_indexmap_139ns_2048sl_42s_doc_idx.npy +[default0]: > loading sample-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-ml/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-ml_text_document_validation_pretraining_indexmap_139ns_2048sl_42s_sample_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-ml/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-ml_text_document_validation_pretraining_indexmap_139ns_2048sl_42s_shuffle_idx.npy +[default0]: loaded indexed file in 0.043 seconds +[default0]: total number of samples: 47613 +[default0]: total number of epochs: 1 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.106277 seconds +[default0]: number of documents: 3017261 +[default0]: > dataset split: +[default0]: validation_pretraining: +[default0]: document indices in [2866398, 3017261) total of 150863 documents +[default0]: > loading doc-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-mr/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-mr_text_document_validation_pretraining_indexmap_68ns_2048sl_42s_doc_idx.npy +[default0]: > loading sample-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-mr/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-mr_text_document_validation_pretraining_indexmap_68ns_2048sl_42s_sample_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-mr/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-mr_text_document_validation_pretraining_indexmap_68ns_2048sl_42s_shuffle_idx.npy +[default0]: loaded indexed file in 0.034 seconds +[default0]: total number of samples: 29298 +[default0]: total number of epochs: 1 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.093945 seconds +[default0]: number of documents: 3648041 +[default0]: > dataset split: +[default0]: validation_pretraining: +[default0]: document indices in [3465639, 3648041) total of 182402 documents +[default0]: > loading doc-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-ne/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-ne_text_document_validation_pretraining_indexmap_90ns_2048sl_42s_doc_idx.npy +[default0]: > loading sample-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-ne/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-ne_text_document_validation_pretraining_indexmap_90ns_2048sl_42s_sample_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-ne/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-ne_text_document_validation_pretraining_indexmap_90ns_2048sl_42s_shuffle_idx.npy +[default0]: loaded indexed file in 0.020 seconds +[default0]: total number of samples: 5659 +[default0]: total number of epochs: 1 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.140260 seconds +[default0]: number of documents: 4327282 +[default0]: > dataset split: +[default0]: validation_pretraining: +[default0]: document indices in [4110918, 4327282) total of 216364 documents +[default0]: > loading doc-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-or/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-or_text_document_validation_pretraining_indexmap_49ns_2048sl_42s_doc_idx.npy +[default0]: > loading sample-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-or/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-or_text_document_validation_pretraining_indexmap_49ns_2048sl_42s_sample_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-or/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-or_text_document_validation_pretraining_indexmap_49ns_2048sl_42s_shuffle_idx.npy +[default0]: loaded indexed file in 0.022 seconds +[default0]: total number of samples: 12423 +[default0]: total number of epochs: 1 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.157212 seconds +[default0]: number of documents: 2698896 +[default0]: > dataset split: +[default0]: validation_pretraining: +[default0]: document indices in [2563951, 2698896) total of 134945 documents +[default0]: > loading doc-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-pa/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-pa_text_document_validation_pretraining_indexmap_69ns_2048sl_42s_doc_idx.npy +[default0]: > loading sample-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-pa/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-pa_text_document_validation_pretraining_indexmap_69ns_2048sl_42s_sample_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-pa/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-pa_text_document_validation_pretraining_indexmap_69ns_2048sl_42s_shuffle_idx.npy +[default0]: loaded indexed file in 0.025 seconds +[default0]: total number of samples: 19133 +[default0]: total number of epochs: 1 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.074237 seconds +[default0]: number of documents: 12767593 +[default0]: > dataset split: +[default0]: validation_pretraining: +[default0]: document indices in [12129213, 12767593) total of 638380 documents +[default0]: > loading doc-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-ta/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-ta_text_document_validation_pretraining_indexmap_283ns_2048sl_42s_doc_idx.npy +[default0]: > loading sample-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-ta/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-ta_text_document_validation_pretraining_indexmap_283ns_2048sl_42s_sample_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-ta/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-ta_text_document_validation_pretraining_indexmap_283ns_2048sl_42s_shuffle_idx.npy +[default0]: loaded indexed file in 0.037 seconds +[default0]: total number of samples: 87928 +[default0]: total number of epochs: 1 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.114973 seconds +[default0]: number of documents: 4342323 +[default0]: > dataset split: +[default0]: validation_pretraining: +[default0]: document indices in [4125207, 4342323) total of 217116 documents +[default0]: > loading doc-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-te/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-te_text_document_validation_pretraining_indexmap_123ns_2048sl_42s_doc_idx.npy +[default0]: > loading sample-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-te/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-te_text_document_validation_pretraining_indexmap_123ns_2048sl_42s_sample_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-te/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-te_text_document_validation_pretraining_indexmap_123ns_2048sl_42s_shuffle_idx.npy +[default0]: loaded indexed file in 0.060 seconds +[default0]: total number of samples: 69780 +[default0]: total number of epochs: 1 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.077498 seconds +[default0]: number of documents: 3022722 +[default0]: > dataset split: +[default0]: validation_pretraining: +[default0]: document indices in [2871586, 3022722) total of 151136 documents +[default0]: > loading doc-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-ur/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-ur_text_document_validation_pretraining_indexmap_167ns_2048sl_42s_doc_idx.npy +[default0]: > loading sample-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-ur/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-ur_text_document_validation_pretraining_indexmap_167ns_2048sl_42s_sample_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-ur/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-ur_text_document_validation_pretraining_indexmap_167ns_2048sl_42s_shuffle_idx.npy +[default0]: loaded indexed file in 0.024 seconds +[default0]: total number of samples: 22532 +[default0]: total number of epochs: 1 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.107718 seconds +[default0]: number of documents: 1162568 +[default0]: > dataset split: +[default0]: validation_pretraining: +[default0]: document indices in [1104440, 1162568) total of 58128 documents +[default0]: > loading doc-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/nigercongo-all/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_nigercongo-all_text_document_validation_pretraining_indexmap_43ns_2048sl_42s_doc_idx.npy +[default0]: > loading sample-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/nigercongo-all/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_nigercongo-all_text_document_validation_pretraining_indexmap_43ns_2048sl_42s_sample_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/nigercongo-all/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_nigercongo-all_text_document_validation_pretraining_indexmap_43ns_2048sl_42s_shuffle_idx.npy +[default0]: loaded indexed file in 0.004 seconds +[default0]: total number of samples: 1608 +[default0]: total number of epochs: 1 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.185737 seconds +[default0]: number of documents: 55294645 +[default0]: > dataset split: +[default0]: validation_pretraining: +[default0]: document indices in [52529913, 55294645) total of 2764732 documents +[default0]: > loading doc-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/oscar-en/meg_ds_bigscience_tokenizer_text_document_validation_pretraining_indexmap_10887ns_2048sl_42s_doc_idx.npy +[default0]: > loading sample-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/oscar-en/meg_ds_bigscience_tokenizer_text_document_validation_pretraining_indexmap_10887ns_2048sl_42s_sample_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/oscar-en/meg_ds_bigscience_tokenizer_text_document_validation_pretraining_indexmap_10887ns_2048sl_42s_shuffle_idx.npy +[default0]: loaded indexed file in 0.105 seconds +[default0]: total number of samples: 690621 +[default0]: total number of epochs: 1 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.152320 seconds +[default0]: number of documents: 44855616 +[default0]: > dataset split: +[default0]: validation_pretraining: +[default0]: document indices in [42612835, 44855616) total of 2242781 documents +[default0]: > loading doc-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/oscar-zh/meg_ds_bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_text_document_validation_pretraining_indexmap_7398ns_2048sl_42s_doc_idx.npy +[default0]: > loading sample-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/oscar-zh/meg_ds_bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_text_document_validation_pretraining_indexmap_7398ns_2048sl_42s_sample_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/oscar-zh/meg_ds_bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_text_document_validation_pretraining_indexmap_7398ns_2048sl_42s_shuffle_idx.npy +[default0]: loaded indexed file in 0.122 seconds +[default0]: total number of samples: 468689 +[default0]: total number of epochs: 1 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.175206 seconds +[default0]: number of documents: 31969891 +[default0]: > dataset split: +[default0]: validation_pretraining: +[default0]: document indices in [30371396, 31969891) total of 1598495 documents +[default0]: > loading doc-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/pt/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_pt_text_document_validation_pretraining_indexmap_6628ns_2048sl_42s_doc_idx.npy +[default0]: > loading sample-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/pt/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_pt_text_document_validation_pretraining_indexmap_6628ns_2048sl_42s_sample_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/pt/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_pt_text_document_validation_pretraining_indexmap_6628ns_2048sl_42s_shuffle_idx.npy +[default0]: loaded indexed file in 0.110 seconds +[default0]: total number of samples: 497625 +[default0]: total number of epochs: 1 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.098387 seconds +[default0]: number of documents: 34110375 +[default0]: > dataset split: +[default0]: validation_pretraining: +[default0]: document indices in [32404856, 34110375) total of 1705519 documents +[default0]: > loading doc-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/vi/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_vi_text_document_validation_pretraining_indexmap_3294ns_2048sl_42s_doc_idx.npy +[default0]: > loading sample-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/vi/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_vi_text_document_validation_pretraining_indexmap_3294ns_2048sl_42s_sample_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/vi/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_vi_text_document_validation_pretraining_indexmap_3294ns_2048sl_42s_shuffle_idx.npy +[default0]: loaded indexed file in 0.079 seconds +[default0]: total number of samples: 125120 +[default0]: total number of epochs: 1 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.074324 seconds +[default0]: number of documents: 43761623 +[default0]: > dataset split: +[default0]: validation_pretraining: +[default0]: document indices in [41573542, 43761623) total of 2188081 documents +[default0]: > loading doc-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/zhs/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_zhs_text_document_validation_pretraining_indexmap_16178ns_2048sl_42s_doc_idx.npy +[default0]: > loading sample-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/zhs/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_zhs_text_document_validation_pretraining_indexmap_16178ns_2048sl_42s_sample_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/zhs/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_zhs_text_document_validation_pretraining_indexmap_16178ns_2048sl_42s_shuffle_idx.npy +[default0]: loaded indexed file in 0.131 seconds +[default0]: total number of samples: 1010592 +[default0]: total number of epochs: 1 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.052552 seconds +[default0]: number of documents: 197602 +[default0]: > dataset split: +[default0]: validation_pretraining: +[default0]: document indices in [187722, 197602) total of 9880 documents +[default0]: > loading doc-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/zht/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_zht_text_document_validation_pretraining_indexmap_70ns_2048sl_42s_doc_idx.npy +[default0]: > loading sample-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/zht/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_zht_text_document_validation_pretraining_indexmap_70ns_2048sl_42s_sample_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/zht/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_zht_text_document_validation_pretraining_indexmap_70ns_2048sl_42s_shuffle_idx.npy +[default0]: loaded indexed file in 0.004 seconds +[default0]: total number of samples: 4451 +[default0]: total number of epochs: 1 +[default0]:> building indices for blendable datasets ... +[default0]:Skipping sample id=210815. Maximum sequence length: 2049, sample length: 4593 +[default0]:Skipping sample id=615685. Maximum sequence length: 2049, sample length: 3740 +[default0]:Skipping sample id=1144204. Maximum sequence length: 2049, sample length: 5130 +[default0]:Skipping sample id=1069021. Maximum sequence length: 2049, sample length: 2877 +[default0]: > sample ratios: +[default0]: dataset 0, input: 0.0330676, achieved: 0.0330676 +[default0]: dataset 1, input: 0.0112421, achieved: 0.0112421 +[default0]: dataset 2, input: 0.130272, achieved: 0.130272 +[default0]: dataset 3, input: 0.221712, achieved: 0.221712 +[default0]: dataset 4, input: 0.106678, achieved: 0.106678 +[default0]: dataset 5, input: 0.00155951, achieved: 0.00155955 +[default0]: dataset 6, input: 0.13054, achieved: 0.13054 +[default0]: dataset 7, input: 0.010918, achieved: 0.0109181 +[default0]: dataset 8, input: 0.000110214, achieved: 0.000110257 +[default0]: dataset 9, input: 0.00549238, achieved: 0.00549235 +[default0]: dataset 10, input: 0.000402122, achieved: 0.000402094 +[default0]: dataset 11, input: 0.00747007, achieved: 0.00747007 +[default0]: dataset 12, input: 0.000619047, achieved: 0.000619024 +[default0]: dataset 13, input: 0.00103353, achieved: 0.0010336 +[default0]: dataset 14, input: 0.000501201, achieved: 0.000501226 +[default0]: dataset 15, input: 0.000667277, achieved: 0.000667231 +[default0]: dataset 16, input: 0.000359281, achieved: 0.000359326 +[default0]: dataset 17, input: 0.000508443, achieved: 0.000508519 +[default0]: dataset 18, input: 0.00211373, achieved: 0.0021138 +[default0]: dataset 19, input: 0.000912995, achieved: 0.000912961 +[default0]: dataset 20, input: 0.00124543, achieved: 0.00124546 +[default0]: dataset 21, input: 0.000315887, achieved: 0.00031594 +[default0]: dataset 22, input: 0.0813721, achieved: 0.0813721 +[default0]: dataset 23, input: 0.0552939, achieved: 0.0552939 +[default0]: dataset 24, input: 0.0495415, achieved: 0.0495414 +[default0]: dataset 25, input: 0.0246164, achieved: 0.0246163 +[default0]: dataset 26, input: 0.120917, achieved: 0.120917 +[default0]: dataset 27, input: 0.000517703, achieved: 0.000517666 +[default0]:> elapsed time for building blendable dataset indices: 0.32 (sec) +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.013175 seconds +[default0]: number of documents: 31495184 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [29920425, 31495184) total of 1574759 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.011527 seconds +[default0]: number of documents: 31495184 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.004044 seconds +[default0]: number of documents: 31495184 +[default0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[default0]:Skipping sample id=1205756. Maximum sequence length: 2049, sample length: 3095 +[default0]:Skipping sample id=1088047. Maximum sequence length: 2049, sample length: 3102 +[default0]:Skipping sample id=405751. Maximum sequence length: 2049, sample length: 2285 +[default0]:Skipping sample id=1095715. Maximum sequence length: 2049, sample length: 2874 +[default0]:Skipping sample id=1481701. Maximum sequence length: 2049, sample length: 2490 +[default0]:Skipping sample id=493107. Maximum sequence length: 2049, sample length: 3588 +[default0]:Skipping sample id=729322. Maximum sequence length: 2049, sample length: 2224 +[default0]:Skipping sample id=1274543. Maximum sequence length: 2049, sample length: 4011 +[default0]:Skipping sample id=1401311. Maximum sequence length: 2049, sample length: 2840 +[default0]:Skipping sample id=473870. Maximum sequence length: 2049, sample length: 4606 +[default0]:Skipping sample id=115560. Maximum sequence length: 2049, sample length: 2392 +[default0]:Skipping sample id=984513. Maximum sequence length: 2049, sample length: 2155 +[default0]:Skipping sample id=200554. Maximum sequence length: 2049, sample length: 2200 +[default0]:Skipping sample id=711052. Maximum sequence length: 2049, sample length: 2184 +[default0]:Skipping sample id=810105. Maximum sequence length: 2049, sample length: 3340 +[default0]:Skipping sample id=149319. Maximum sequence length: 2049, sample length: 4243 +[default0]:Skipping sample id=472965. Maximum sequence length: 2049, sample length: 5238 +[default0]:Skipping sample id=869937. Maximum sequence length: 2049, sample length: 2475 +[default0]:Skipping sample id=91880. Maximum sequence length: 2049, sample length: 2461 +[default0]:Skipping sample id=1458607. Maximum sequence length: 2049, sample length: 2848 +[default0]:Skipping sample id=1028504. Maximum sequence length: 2049, sample length: 3031 +[default0]:Skipping sample id=112363. Maximum sequence length: 2049, sample length: 2502 +[default0]:Skipping sample id=189195. Maximum sequence length: 2049, sample length: 2610 +[default0]:Skipping sample id=1367212. Maximum sequence length: 2049, sample length: 2702 +[default0]:Skipping sample id=1364987. Maximum sequence length: 2049, sample length: 2902 +[default0]:Skipping sample id=874041. Maximum sequence length: 2049, sample length: 2530 +[default0]:Skipping sample id=1511736. Maximum sequence length: 2049, sample length: 3002 +[default0]:Skipping sample id=74700. Maximum sequence length: 2049, sample length: 2437 +[default0]:Skipping sample id=425705. Maximum sequence length: 2049, sample length: 6730 +[default0]:Skipping sample id=341888. Maximum sequence length: 2049, sample length: 2084 +[default0]:Skipping sample id=477076. Maximum sequence length: 2049, sample length: 2241 +[default0]:Skipping sample id=252639. Maximum sequence length: 2049, sample length: 2842 +[default0]:Skipping sample id=1245248. Maximum sequence length: 2049, sample length: 3276 +[default0]:Skipping sample id=1570339. Maximum sequence length: 2049, sample length: 5103 +[default0]:Skipping sample id=161523. Maximum sequence length: 2049, sample length: 3483 +[default0]:Skipping sample id=607568. Maximum sequence length: 2049, sample length: 2346 +[default0]:Skipping sample id=569820. Maximum sequence length: 2049, sample length: 4178 +[default0]:Skipping sample id=758246. Maximum sequence length: 2049, sample length: 3062 +[default0]:Skipping sample id=1214833. Maximum sequence length: 2049, sample length: 3731 +[default0]:Skipping sample id=1186563. Maximum sequence length: 2049, sample length: 2299 +[default0]:Skipping sample id=1043079. Maximum sequence length: 2049, sample length: 3781 +[default0]:Skipping sample id=128696. Maximum sequence length: 2049, sample length: 2933 +[default0]:Skipping sample id=873078. Maximum sequence length: 2049, sample length: 3031 +[default0]:Skipping sample id=925919. Maximum sequence length: 2049, sample length: 2769 +[default0]:Skipping sample id=1458363. Maximum sequence length: 2049, sample length: 3798 +[default0]:Skipping sample id=1550246. Maximum sequence length: 2049, sample length: 2653 +[default0]:Skipping sample id=662551. Maximum sequence length: 2049, sample length: 2712 +[default0]:Skipping sample id=72957. Maximum sequence length: 2049, sample length: 2113 +[default0]:Skipping sample id=158460. Maximum sequence length: 2049, sample length: 2890 +[default0]:Skipping sample id=1348714. Maximum sequence length: 2049, sample length: 2159 +[default0]:Skipping sample id=801668. Maximum sequence length: 2049, sample length: 2181 +[default0]:Skipping sample id=1038315. Maximum sequence length: 2049, sample length: 2261 +[default0]:Skipping sample id=1126622. Maximum sequence length: 2049, sample length: 2094 +[default0]:Skipping sample id=253368. Maximum sequence length: 2049, sample length: 2300 +[default0]:Skipping sample id=1243151. Maximum sequence length: 2049, sample length: 2930 +[default0]:Skipping sample id=722767. Maximum sequence length: 2049, sample length: 2935 +[default0]:Skipping sample id=676730. Maximum sequence length: 2049, sample length: 4307 +[default0]:Skipping sample id=637686. Maximum sequence length: 2049, sample length: 2799 +[default0]:Skipping sample id=378656. Maximum sequence length: 2049, sample length: 5243 +[default0]:Skipping sample id=782609. Maximum sequence length: 2049, sample length: 2185 +[default0]:Skipping sample id=58138. Maximum sequence length: 2049, sample length: 2274 +[default0]:Skipping sample id=1086307. Maximum sequence length: 2049, sample length: 2229 +[default0]:Skipping sample id=1563253. Maximum sequence length: 2049, sample length: 2451 +[default0]:Skipping sample id=541074. Maximum sequence length: 2049, sample length: 2843 +[default0]:Skipping sample id=50101. Maximum sequence length: 2049, sample length: 2206 +[default0]:Skipping sample id=629169. Maximum sequence length: 2049, sample length: 2348 +[default0]:Skipping sample id=1499501. Maximum sequence length: 2049, sample length: 2541 +[default0]:Skipping sample id=181189. Maximum sequence length: 2049, sample length: 2540 +[default0]:Skipping sample id=785568. Maximum sequence length: 2049, sample length: 2090 +[default0]:Skipping sample id=1465172. Maximum sequence length: 2049, sample length: 2054 +[default0]:Skipping sample id=118536. Maximum sequence length: 2049, sample length: 2062 +[default0]:Skipping sample id=1417526. Maximum sequence length: 2049, sample length: 5838 +[default0]:Skipping sample id=1544415. Maximum sequence length: 2049, sample length: 2565 +[default0]:Skipping sample id=1370503. Maximum sequence length: 2049, sample length: 3207 +[default0]:Skipping sample id=1111892. Maximum sequence length: 2049, sample length: 2464 +[default0]:Skipping sample id=791163. Maximum sequence length: 2049, sample length: 3523 +[default0]:Skipping sample id=1034602. Maximum sequence length: 2049, sample length: 2149 +[default0]:Skipping sample id=1297712. Maximum sequence length: 2049, sample length: 2239 +[default0]:Skipping sample id=1517204. Maximum sequence length: 2049, sample length: 2382 +[default0]:Skipping sample id=616294. Maximum sequence length: 2049, sample length: 3407 +[default0]:Skipping sample id=507708. Maximum sequence length: 2049, sample length: 3246 +[default0]:Skipping sample id=1321542. Maximum sequence length: 2049, sample length: 2168 +[default0]:Skipping sample id=935552. Maximum sequence length: 2049, sample length: 3141 +[default0]:Skipping sample id=448223. Maximum sequence length: 2049, sample length: 2193 +[default0]:Skipping sample id=566173. Maximum sequence length: 2049, sample length: 2105 +[default0]:Skipping sample id=305460. Maximum sequence length: 2049, sample length: 2403 +[default0]:Skipping sample id=1499105. Maximum sequence length: 2049, sample length: 2554 +[default0]:Skipping sample id=1250653. Maximum sequence length: 2049, sample length: 2269 +[default0]:Skipping sample id=646830. Maximum sequence length: 2049, sample length: 2616 +[default0]:Skipping sample id=491374. Maximum sequence length: 2049, sample length: 2608 +[default0]:Skipping sample id=1410793. Maximum sequence length: 2049, sample length: 3380 +[default0]:Skipping sample id=724899. Maximum sequence length: 2049, sample length: 2121 +[default0]:Skipping sample id=798515. Maximum sequence length: 2049, sample length: 2890 +[default0]:Skipping sample id=427583. Maximum sequence length: 2049, sample length: 2742 +[default0]:Skipping sample id=588617. Maximum sequence length: 2049, sample length: 2162 +[default0]:Skipping sample id=870826. Maximum sequence length: 2049, sample length: 3306 +[default0]:Skipping sample id=151350. Maximum sequence length: 2049, sample length: 2229 +[default0]:Skipping sample id=1144628. Maximum sequence length: 2049, sample length: 3388 +[default0]:Skipping sample id=1264633. Maximum sequence length: 2049, sample length: 3026 +[default0]:Skipping sample id=1561185. Maximum sequence length: 2049, sample length: 2305 +[default0]:Skipping sample id=354860. Maximum sequence length: 2049, sample length: 2553 +[default0]:Skipping sample id=783009. Maximum sequence length: 2049, sample length: 3840 +[default0]:Skipping sample id=680045. Maximum sequence length: 2049, sample length: 2631 +[default0]:Skipping sample id=138712. Maximum sequence length: 2049, sample length: 2159 +[default0]:Skipping sample id=1460263. Maximum sequence length: 2049, sample length: 2106 +[default0]:Skipping sample id=684838. Maximum sequence length: 2049, sample length: 5701 +[default0]:Skipping sample id=1423390. Maximum sequence length: 2049, sample length: 2467 +[default0]:Skipping sample id=560997. Maximum sequence length: 2049, sample length: 2396 +[default0]:Skipping sample id=510794. Maximum sequence length: 2049, sample length: 3819 +[default0]:Skipping sample id=876794. Maximum sequence length: 2049, sample length: 4527 +[default0]:Skipping sample id=829976. Maximum sequence length: 2049, sample length: 2221 +[default0]:Skipping sample id=385521. Maximum sequence length: 2049, sample length: 2573 +[default0]:Skipping sample id=336668. Maximum sequence length: 2049, sample length: 3787 +[default0]:Skipping sample id=895148. Maximum sequence length: 2049, sample length: 2209 +[default0]:Skipping sample id=727923. Maximum sequence length: 2049, sample length: 3123 +[default0]:Skipping sample id=353227. Maximum sequence length: 2049, sample length: 2073 +[default0]:Skipping sample id=194462. Maximum sequence length: 2049, sample length: 2315 +[default0]:Skipping sample id=1251669. Maximum sequence length: 2049, sample length: 2107 +[default0]:Skipping sample id=413469. Maximum sequence length: 2049, sample length: 2241 +[default0]:Skipping sample id=590250. Maximum sequence length: 2049, sample length: 3344 +[default0]:Skipping sample id=877247. Maximum sequence length: 2049, sample length: 2480 +[default0]:Skipping sample id=402132. Maximum sequence length: 2049, sample length: 2074 +[default0]:Skipping sample id=1358807. Maximum sequence length: 2049, sample length: 2378 +[default0]:Skipping sample id=1461163. Maximum sequence length: 2049, sample length: 2723 +[default0]:Skipping sample id=495331. Maximum sequence length: 2049, sample length: 2211 +[default0]:Skipping sample id=871250. Maximum sequence length: 2049, sample length: 2203 +[default0]:Skipping sample id=7433. Maximum sequence length: 2049, sample length: 3476 +[default0]:Skipping sample id=340366. Maximum sequence length: 2049, sample length: 3492 +[default0]:Skipping sample id=614116. Maximum sequence length: 2049, sample length: 2774 +[default0]:Skipping sample id=263363. Maximum sequence length: 2049, sample length: 2849 +[default0]:Skipping sample id=1415012. Maximum sequence length: 2049, sample length: 2815 +[default0]:Skipping sample id=613541. Maximum sequence length: 2049, sample length: 2541 +[default0]:Skipping sample id=1074231. Maximum sequence length: 2049, sample length: 2077 +[default0]:Skipping sample id=114857. Maximum sequence length: 2049, sample length: 2428 +[default0]:Skipping sample id=1362779. Maximum sequence length: 2049, sample length: 2799 +[default0]:Skipping sample id=354491. Maximum sequence length: 2049, sample length: 3631 +[default0]:Skipping sample id=279272. Maximum sequence length: 2049, sample length: 2613 +[default0]:Skipping sample id=469924. Maximum sequence length: 2049, sample length: 2515 +[default0]:Skipping sample id=611252. Maximum sequence length: 2049, sample length: 2316 +[default0]:Skipping sample id=941780. Maximum sequence length: 2049, sample length: 2501 +[default0]:Skipping sample id=576608. Maximum sequence length: 2049, sample length: 2462 +[default0]:Skipping sample id=1139694. Maximum sequence length: 2049, sample length: 2454 +[default0]:Skipping sample id=177366. Maximum sequence length: 2049, sample length: 5090 +[default0]:Skipping sample id=143327. Maximum sequence length: 2049, sample length: 3582 +[default0]:Skipping sample id=840686. Maximum sequence length: 2049, sample length: 3260 +[default0]:Skipping sample id=373237. Maximum sequence length: 2049, sample length: 2187 +[default0]:Skipping sample id=206627. Maximum sequence length: 2049, sample length: 2592 +[default0]:Skipping sample id=1043866. Maximum sequence length: 2049, sample length: 2343 +[default0]:Skipping sample id=363692. Maximum sequence length: 2049, sample length: 2743 +[default0]:Skipping sample id=145295. Maximum sequence length: 2049, sample length: 3003 +[default0]:Skipping sample id=464120. Maximum sequence length: 2049, sample length: 2288 +[default0]:Skipping sample id=313148. Maximum sequence length: 2049, sample length: 4615 +[default0]:Skipping sample id=331372. Maximum sequence length: 2049, sample length: 2536 +[default0]:Skipping sample id=10174. Maximum sequence length: 2049, sample length: 2625 +[default0]:Skipping sample id=884362. Maximum sequence length: 2049, sample length: 2547 +[default0]:Skipping sample id=1424871. Maximum sequence length: 2049, sample length: 2597 +[default0]:Skipping sample id=453682. Maximum sequence length: 2049, sample length: 2801 +[default0]:Skipping sample id=710900. Maximum sequence length: 2049, sample length: 2426 +[default0]:Skipping sample id=186807. Maximum sequence length: 2049, sample length: 3012 +[default0]:Skipping sample id=232192. Maximum sequence length: 2049, sample length: 3020 +[default0]:Skipping sample id=710284. Maximum sequence length: 2049, sample length: 2109 +[default0]:Skipping sample id=623084. Maximum sequence length: 2049, sample length: 3008 +[default0]:Skipping sample id=448477. Maximum sequence length: 2049, sample length: 2323 +[default0]:Skipping sample id=1232565. Maximum sequence length: 2049, sample length: 3809 +[default0]:Skipping sample id=321994. Maximum sequence length: 2049, sample length: 2793 +[default0]:Skipping sample id=1283000. Maximum sequence length: 2049, sample length: 2087 +[default0]:Skipping sample id=454275. Maximum sequence length: 2049, sample length: 2725 +[default0]:Skipping sample id=1556366. Maximum sequence length: 2049, sample length: 5097 +[default0]:Skipping sample id=874710. Maximum sequence length: 2049, sample length: 2127 +[default0]:Skipping sample id=817062. Maximum sequence length: 2049, sample length: 3787 +[default0]:Skipping sample id=978792. Maximum sequence length: 2049, sample length: 4499 +[default0]:Skipping sample id=530725. Maximum sequence length: 2049, sample length: 2559 +[default0]:Skipping sample id=1432198. Maximum sequence length: 2049, sample length: 2222 +[default0]:Skipping sample id=1350159. Maximum sequence length: 2049, sample length: 3284 +[default0]:Skipping sample id=533062. Maximum sequence length: 2049, sample length: 3921 +[default0]:Skipping sample id=34251. Maximum sequence length: 2049, sample length: 2760 +[default0]:Skipping sample id=1283422. Maximum sequence length: 2049, sample length: 2423 +[default0]:Skipping sample id=100075. Maximum sequence length: 2049, sample length: 2407 +[default0]:Skipping sample id=467017. Maximum sequence length: 2049, sample length: 2137 +[default0]:Skipping sample id=1484568. Maximum sequence length: 2049, sample length: 2352 +[default0]:Skipping sample id=593422. Maximum sequence length: 2049, sample length: 4047 +[default0]:Skipping sample id=1310672. Maximum sequence length: 2049, sample length: 3237 +[default0]:Skipping sample id=462464. Maximum sequence length: 2049, sample length: 3718 +[default0]:Skipping sample id=830484. Maximum sequence length: 2049, sample length: 2181 +[default0]:Skipping sample id=156469. Maximum sequence length: 2049, sample length: 4666 +[default0]:Skipping sample id=708965. Maximum sequence length: 2049, sample length: 2298 +[default0]:Skipping sample id=245532. Maximum sequence length: 2049, sample length: 2627 +[default0]:Skipping sample id=901635. Maximum sequence length: 2049, sample length: 2443 +[default0]:Skipping sample id=1556398. Maximum sequence length: 2049, sample length: 2263 +[default0]:Skipping sample id=236209. Maximum sequence length: 2049, sample length: 3179 +[default0]:Skipping sample id=520065. Maximum sequence length: 2049, sample length: 4144 +[default0]:Skipping sample id=477077. Maximum sequence length: 2049, sample length: 2617 +[default0]:Skipping sample id=658676. Maximum sequence length: 2049, sample length: 2522 +[default0]:Skipping sample id=395312. Maximum sequence length: 2049, sample length: 3081 +[default0]:Skipping sample id=1502675. Maximum sequence length: 2049, sample length: 2991 +[default0]:Skipping sample id=1487574. Maximum sequence length: 2049, sample length: 3659 +[default0]:Skipping sample id=1071847. Maximum sequence length: 2049, sample length: 2483 +[default0]:Skipping sample id=874132. Maximum sequence length: 2049, sample length: 2405 +[default0]:Skipping sample id=1100837. Maximum sequence length: 2049, sample length: 2119 +[default0]:Skipping sample id=887979. Maximum sequence length: 2049, sample length: 2618 +[default0]:Skipping sample id=1259410. Maximum sequence length: 2049, sample length: 2147 +[default0]:Skipping sample id=754305. Maximum sequence length: 2049, sample length: 3293 +[default0]:Skipping sample id=807828. Maximum sequence length: 2049, sample length: 2242 +[default0]:Skipping sample id=419687. Maximum sequence length: 2049, sample length: 2537 +[default0]:Skipping sample id=851364. Maximum sequence length: 2049, sample length: 2307 +[default0]:Skipping sample id=35589. Maximum sequence length: 2049, sample length: 2143 +[default0]:Skipping sample id=258949. Maximum sequence length: 2049, sample length: 2329 +[default0]:Skipping sample id=175461. Maximum sequence length: 2049, sample length: 2193 +[default0]:Skipping sample id=246578. Maximum sequence length: 2049, sample length: 2150 +[default0]:Skipping sample id=1097252. Maximum sequence length: 2049, sample length: 2164 +[default0]:Skipping sample id=995591. Maximum sequence length: 2049, sample length: 2479 +[default0]:Skipping sample id=854788. Maximum sequence length: 2049, sample length: 2407 +[default0]:Skipping sample id=1158328. Maximum sequence length: 2049, sample length: 2658 +[default0]:Skipping sample id=763508. Maximum sequence length: 2049, sample length: 2549 +[default0]:Skipping sample id=1475670. Maximum sequence length: 2049, sample length: 2177 +[default0]:Skipping sample id=1279089. Maximum sequence length: 2049, sample length: 2061 +[default0]:Skipping sample id=850360. Maximum sequence length: 2049, sample length: 2323 +[default0]:Skipping sample id=903780. Maximum sequence length: 2049, sample length: 2776 +[default0]:Skipping sample id=1068832. Maximum sequence length: 2049, sample length: 3150 +[default0]:Skipping sample id=132000. Maximum sequence length: 2049, sample length: 2784 +[default0]:Skipping sample id=140935. Maximum sequence length: 2049, sample length: 2134 +[default0]:Skipping sample id=1431425. Maximum sequence length: 2049, sample length: 2429 +[default0]:Skipping sample id=1020915. Maximum sequence length: 2049, sample length: 3195 +[default0]:Skipping sample id=241538. Maximum sequence length: 2049, sample length: 4060 +[default0]:Skipping sample id=177756. Maximum sequence length: 2049, sample length: 2924 +[default0]:Skipping sample id=903106. Maximum sequence length: 2049, sample length: 3714 +[default0]:Skipping sample id=1360997. Maximum sequence length: 2049, sample length: 2996 +[default0]:Skipping sample id=123910. Maximum sequence length: 2049, sample length: 2139 +[default0]:Skipping sample id=481950. Maximum sequence length: 2049, sample length: 3159 +[default0]:Skipping sample id=1512202. Maximum sequence length: 2049, sample length: 3810 +[default0]:Skipping sample id=1305695. Maximum sequence length: 2049, sample length: 2782 +[default0]:Skipping sample id=1340450. Maximum sequence length: 2049, sample length: 3701 +[default0]:Skipping sample id=827922. Maximum sequence length: 2049, sample length: 2741 +[default0]:Skipping sample id=1508333. Maximum sequence length: 2049, sample length: 2244 +[default0]:Skipping sample id=648439. Maximum sequence length: 2049, sample length: 2094 +[default0]:Skipping sample id=930834. Maximum sequence length: 2049, sample length: 3098 +[default0]:Skipping sample id=221225. Maximum sequence length: 2049, sample length: 4210 +[default0]:Skipping sample id=1441187. Maximum sequence length: 2049, sample length: 2329 +[default0]:Skipping sample id=720925. Maximum sequence length: 2049, sample length: 3848 +[default0]:Skipping sample id=940158. Maximum sequence length: 2049, sample length: 2652 +[default0]:Skipping sample id=871168. Maximum sequence length: 2049, sample length: 3827 +[default0]:Skipping sample id=111857. Maximum sequence length: 2049, sample length: 4339 +[default0]:Skipping sample id=924786. Maximum sequence length: 2049, sample length: 2595 +[default0]:Skipping sample id=87351. Maximum sequence length: 2049, sample length: 5533 +[default0]:Skipping sample id=325627. Maximum sequence length: 2049, sample length: 2102 +[default0]:Skipping sample id=1053130. Maximum sequence length: 2049, sample length: 2808 +[default0]:Skipping sample id=1424035. Maximum sequence length: 2049, sample length: 2470 +[default0]:Skipping sample id=1510093. Maximum sequence length: 2049, sample length: 2581 +[default0]:Skipping sample id=1376231. Maximum sequence length: 2049, sample length: 3266 +[default0]:Skipping sample id=848741. Maximum sequence length: 2049, sample length: 4191 +[default0]:Skipping sample id=775613. Maximum sequence length: 2049, sample length: 2669 +[default0]:Skipping sample id=280566. Maximum sequence length: 2049, sample length: 4336 +[default0]:Skipping sample id=88757. Maximum sequence length: 2049, sample length: 3699 +[default0]:Skipping sample id=245297. Maximum sequence length: 2049, sample length: 3508 +[default0]:Skipping sample id=121793. Maximum sequence length: 2049, sample length: 2911 +[default0]:Skipping sample id=369238. Maximum sequence length: 2049, sample length: 2568 +[default0]:Skipping sample id=1356707. Maximum sequence length: 2049, sample length: 4249 +[default0]:Skipping sample id=1421561. Maximum sequence length: 2049, sample length: 2088 +[default0]:Skipping sample id=383124. Maximum sequence length: 2049, sample length: 2168 +[default0]:Skipping sample id=793352. Maximum sequence length: 2049, sample length: 2666 +[default0]:Skipping sample id=543905. Maximum sequence length: 2049, sample length: 3374 +[default0]:Skipping sample id=820828. Maximum sequence length: 2049, sample length: 3397 +[default0]:Skipping sample id=1185360. Maximum sequence length: 2049, sample length: 3424 +[default0]:Skipping sample id=1410374. Maximum sequence length: 2049, sample length: 2444 +[default0]:Skipping sample id=516135. Maximum sequence length: 2049, sample length: 2299 +[default0]:Skipping sample id=403943. Maximum sequence length: 2049, sample length: 3474 +[default0]:Skipping sample id=1548973. Maximum sequence length: 2049, sample length: 2211 +[default0]:Skipping sample id=555871. Maximum sequence length: 2049, sample length: 2761 +[default0]:Skipping sample id=607028. Maximum sequence length: 2049, sample length: 6933 +[default0]:Skipping sample id=1410981. Maximum sequence length: 2049, sample length: 2102 +[default0]:Skipping sample id=1270880. Maximum sequence length: 2049, sample length: 2269 +[default0]:Skipping sample id=672037. Maximum sequence length: 2049, sample length: 2808 +[default0]:Skipping sample id=342313. Maximum sequence length: 2049, sample length: 3492 +[default0]:Skipping sample id=910054. Maximum sequence length: 2049, sample length: 2118 +[default0]:Skipping sample id=634351. Maximum sequence length: 2049, sample length: 3376 +[default0]:Skipping sample id=143876. Maximum sequence length: 2049, sample length: 4180 +[default0]:Skipping sample id=1224134. Maximum sequence length: 2049, sample length: 3652 +[default0]:Skipping sample id=1523527. Maximum sequence length: 2049, sample length: 3893 +[default0]:Skipping sample id=97763. Maximum sequence length: 2049, sample length: 4201 +[default0]:Skipping sample id=1151476. Maximum sequence length: 2049, sample length: 4670 +[default0]:Skipping sample id=956938. Maximum sequence length: 2049, sample length: 2295 +[default0]:Skipping sample id=175960. Maximum sequence length: 2049, sample length: 2233 +[default0]:Skipping sample id=1449436. Maximum sequence length: 2049, sample length: 4372 +[default0]:Skipping sample id=1319673. Maximum sequence length: 2049, sample length: 2349 +[default0]:Skipping sample id=436156. Maximum sequence length: 2049, sample length: 2341 +[default0]:Skipping sample id=1155785. Maximum sequence length: 2049, sample length: 3840 +[default0]:Skipping sample id=997810. Maximum sequence length: 2049, sample length: 2205 +[default0]:Skipping sample id=1143398. Maximum sequence length: 2049, sample length: 3557 +[default0]:Skipping sample id=481019. Maximum sequence length: 2049, sample length: 3226 +[default0]:Skipping sample id=1440388. Maximum sequence length: 2049, sample length: 2770 +[default0]:Skipping sample id=347174. Maximum sequence length: 2049, sample length: 2671 +[default0]:Skipping sample id=1302748. Maximum sequence length: 2049, sample length: 4813 +[default0]:Skipping sample id=398079. Maximum sequence length: 2049, sample length: 3017 +[default0]:Skipping sample id=907298. Maximum sequence length: 2049, sample length: 2581 +[default0]:Skipping sample id=1546908. Maximum sequence length: 2049, sample length: 2154 +[default0]:Skipping sample id=1459019. Maximum sequence length: 2049, sample length: 2171 +[default0]:Skipping sample id=440653. Maximum sequence length: 2049, sample length: 2981 +[default0]:Skipping sample id=1180757. Maximum sequence length: 2049, sample length: 2222 +[default0]:Skipping sample id=1533132. Maximum sequence length: 2049, sample length: 2653 +[default0]:Skipping sample id=1256327. Maximum sequence length: 2049, sample length: 2073 +[default0]:Skipping sample id=1116039. Maximum sequence length: 2049, sample length: 2280 +[default0]:Skipping sample id=848728. Maximum sequence length: 2049, sample length: 2387 +[default0]:Skipping sample id=1282715. Maximum sequence length: 2049, sample length: 4146 +[default0]:Skipping sample id=1471643. Maximum sequence length: 2049, sample length: 3161 +[default0]:Skipping sample id=798976. Maximum sequence length: 2049, sample length: 3574 +[default0]:Skipping sample id=1030708. Maximum sequence length: 2049, sample length: 2129 +[default0]:Skipping sample id=1539509. Maximum sequence length: 2049, sample length: 2621 +[default0]:Skipping sample id=1411921. Maximum sequence length: 2049, sample length: 3600 +[default0]:Skipping sample id=809647. Maximum sequence length: 2049, sample length: 3086 +[default0]:Skipping sample id=1255999. Maximum sequence length: 2049, sample length: 2079 +[default0]:Skipping sample id=254897. Maximum sequence length: 2049, sample length: 2493 +[default0]:Skipping sample id=459680. Maximum sequence length: 2049, sample length: 3559 +[default0]:Skipping sample id=958268. Maximum sequence length: 2049, sample length: 2644 +[default0]:Skipping sample id=1467210. Maximum sequence length: 2049, sample length: 2767 +[default0]:Skipping sample id=90728. Maximum sequence length: 2049, sample length: 2068 +[default0]:Skipping sample id=220549. Maximum sequence length: 2049, sample length: 2884 +[default0]:Skipping sample id=168186. Maximum sequence length: 2049, sample length: 2849 +[default0]:Skipping sample id=54523. Maximum sequence length: 2049, sample length: 3420 +[default0]:Skipping sample id=101253. Maximum sequence length: 2049, sample length: 3347 +[default0]:Skipping sample id=946783. Maximum sequence length: 2049, sample length: 2954 +[default0]:Skipping sample id=663774. Maximum sequence length: 2049, sample length: 2269 +[default0]:Skipping sample id=1359435. Maximum sequence length: 2049, sample length: 2167 +[default0]:Skipping sample id=970321. Maximum sequence length: 2049, sample length: 2103 +[default0]:Skipping sample id=564434. Maximum sequence length: 2049, sample length: 4078 +[default0]:Skipping sample id=390144. Maximum sequence length: 2049, sample length: 2213 +[default0]:Skipping sample id=933328. Maximum sequence length: 2049, sample length: 2619 +[default0]:Skipping sample id=391706. Maximum sequence length: 2049, sample length: 3272 +[default0]:Skipping sample id=761568. Maximum sequence length: 2049, sample length: 2339 +[default0]:Skipping sample id=1010144. Maximum sequence length: 2049, sample length: 3982 +[default0]:Skipping sample id=1210562. Maximum sequence length: 2049, sample length: 2568 +[default0]:Skipping sample id=113301. Maximum sequence length: 2049, sample length: 2160 +[default0]:Skipping sample id=1224100. Maximum sequence length: 2049, sample length: 3196 +[default0]:Skipping sample id=742950. Maximum sequence length: 2049, sample length: 2706 +[default0]:Skipping sample id=35116. Maximum sequence length: 2049, sample length: 3650 +[default0]:Skipping sample id=1112691. Maximum sequence length: 2049, sample length: 2262 +[default0]:Skipping sample id=1178360. Maximum sequence length: 2049, sample length: 2603 +[default0]:Skipping sample id=1266460. Maximum sequence length: 2049, sample length: 5199 +[default0]:Skipping sample id=275566. Maximum sequence length: 2049, sample length: 2636 +[default0]:Skipping sample id=604362. Maximum sequence length: 2049, sample length: 2288 +[default0]:Skipping sample id=561030. Maximum sequence length: 2049, sample length: 2133 +[default0]:Skipping sample id=12347. Maximum sequence length: 2049, sample length: 2700 +[default0]:Skipping sample id=814770. Maximum sequence length: 2049, sample length: 4411 +[default0]:Skipping sample id=868075. Maximum sequence length: 2049, sample length: 2294 +[default0]:Skipping sample id=74343. Maximum sequence length: 2049, sample length: 3130 +[default0]:Skipping sample id=1287546. Maximum sequence length: 2049, sample length: 2180 +[default0]:Skipping sample id=361338. Maximum sequence length: 2049, sample length: 2789 +[default0]:Skipping sample id=1072600. Maximum sequence length: 2049, sample length: 2217 +[default0]:Skipping sample id=90011. Maximum sequence length: 2049, sample length: 6525 +[default0]:Skipping sample id=771785. Maximum sequence length: 2049, sample length: 2819 +[default0]:Skipping sample id=835640. Maximum sequence length: 2049, sample length: 2981 +[default0]:Skipping sample id=1299912. Maximum sequence length: 2049, sample length: 2271 +[default0]:Skipping sample id=1454902. Maximum sequence length: 2049, sample length: 2226 +[default0]:Skipping sample id=1321608. Maximum sequence length: 2049, sample length: 2961 +[default0]:Skipping sample id=408601. Maximum sequence length: 2049, sample length: 2338 +[default0]:Skipping sample id=739128. Maximum sequence length: 2049, sample length: 3112 +[default0]:Skipping sample id=787667. Maximum sequence length: 2049, sample length: 2947 +[default0]:Skipping sample id=749463. Maximum sequence length: 2049, sample length: 2338 +[default0]:Skipping sample id=1217785. Maximum sequence length: 2049, sample length: 3329 +[default0]:Skipping sample id=432757. Maximum sequence length: 2049, sample length: 3324 +[default0]:Skipping sample id=532602. Maximum sequence length: 2049, sample length: 2987 +[default0]:Skipping sample id=1530753. Maximum sequence length: 2049, sample length: 2982 +[default0]:Skipping sample id=26813. Maximum sequence length: 2049, sample length: 3284 +[default0]:Skipping sample id=13736. Maximum sequence length: 2049, sample length: 2787 +[default0]:Skipping sample id=391843. Maximum sequence length: 2049, sample length: 2052 +[default0]:Skipping sample id=225768. Maximum sequence length: 2049, sample length: 2539 +[default0]:Skipping sample id=1388477. Maximum sequence length: 2049, sample length: 2170 +[default0]:Skipping sample id=1458779. Maximum sequence length: 2049, sample length: 2550 +[default0]:Skipping sample id=1322550. Maximum sequence length: 2049, sample length: 4228 +[default0]:Skipping sample id=1288084. Maximum sequence length: 2049, sample length: 2913 +[default0]:Skipping sample id=30239. Maximum sequence length: 2049, sample length: 2210 +[default0]:Skipping sample id=233303. Maximum sequence length: 2049, sample length: 2209 +[default0]:Skipping sample id=237168. Maximum sequence length: 2049, sample length: 2200 +[default0]:Skipping sample id=757392. Maximum sequence length: 2049, sample length: 3630 +[default0]:Skipping sample id=1492163. Maximum sequence length: 2049, sample length: 2908 +[default0]:Skipping sample id=371434. Maximum sequence length: 2049, sample length: 2698 +[default0]:Skipping sample id=779159. Maximum sequence length: 2049, sample length: 2922 +[default0]:Skipping sample id=1148962. Maximum sequence length: 2049, sample length: 2201 +[default0]:Skipping sample id=209575. Maximum sequence length: 2049, sample length: 3646 +[default0]:Skipping sample id=646169. Maximum sequence length: 2049, sample length: 2094 +[default0]:Skipping sample id=286128. Maximum sequence length: 2049, sample length: 2435 +[default0]:Skipping sample id=1302678. Maximum sequence length: 2049, sample length: 2260 +[default0]:Skipping sample id=1324788. Maximum sequence length: 2049, sample length: 2191 +[default0]:Skipping sample id=207082. Maximum sequence length: 2049, sample length: 4147 +[default0]:Skipping sample id=1335796. Maximum sequence length: 2049, sample length: 2561 +[default0]:Skipping sample id=425199. Maximum sequence length: 2049, sample length: 2231 +[default0]:Skipping sample id=103699. Maximum sequence length: 2049, sample length: 2144 +[default0]:Skipping sample id=398804. Maximum sequence length: 2049, sample length: 2065 +[default0]:Skipping sample id=334633. Maximum sequence length: 2049, sample length: 2591 +[default0]:Skipping sample id=1486874. Maximum sequence length: 2049, sample length: 3129 +[default0]:Skipping sample id=346218. Maximum sequence length: 2049, sample length: 2336 +[default0]:Skipping sample id=1479435. Maximum sequence length: 2049, sample length: 2256 +[default0]:Skipping sample id=478503. Maximum sequence length: 2049, sample length: 4239 +[default0]:Skipping sample id=490415. Maximum sequence length: 2049, sample length: 4847 +[default0]:Skipping sample id=1529351. Maximum sequence length: 2049, sample length: 5573 +[default0]:Skipping sample id=1144038. Maximum sequence length: 2049, sample length: 5032 +[default0]:Skipping sample id=403089. Maximum sequence length: 2049, sample length: 2347 +[default0]:Skipping sample id=293051. Maximum sequence length: 2049, sample length: 3270 +[default0]:Skipping sample id=204621. Maximum sequence length: 2049, sample length: 2512 +[default0]:Skipping sample id=1329088. Maximum sequence length: 2049, sample length: 2342 +[default0]:Skipping sample id=1209768. Maximum sequence length: 2049, sample length: 3832 +[default0]:Skipping sample id=969168. Maximum sequence length: 2049, sample length: 2173 +[default0]:Skipping sample id=1190569. Maximum sequence length: 2049, sample length: 2189 +[default0]:Skipping sample id=1185284. Maximum sequence length: 2049, sample length: 2221 +[default0]:Skipping sample id=87123. Maximum sequence length: 2049, sample length: 2456 +[default0]:Skipping sample id=234322. Maximum sequence length: 2049, sample length: 2778 +[default0]:Skipping sample id=1243947. Maximum sequence length: 2049, sample length: 2253 +[default0]:Skipping sample id=307020. Maximum sequence length: 2049, sample length: 2085 +[default0]:Skipping sample id=1398286. Maximum sequence length: 2049, sample length: 2406 +[default0]:Skipping sample id=712006. Maximum sequence length: 2049, sample length: 3596 +[default0]:Skipping sample id=927181. Maximum sequence length: 2049, sample length: 2320 +[default0]:Skipping sample id=1100004. Maximum sequence length: 2049, sample length: 2529 +[default0]:Skipping sample id=1354198. Maximum sequence length: 2049, sample length: 2537 +[default0]:Skipping sample id=1489687. Maximum sequence length: 2049, sample length: 3440 +[default0]:Skipping sample id=349718. Maximum sequence length: 2049, sample length: 3468 +[default0]:Skipping sample id=1431011. Maximum sequence length: 2049, sample length: 3323 +[default0]:Skipping sample id=1518896. Maximum sequence length: 2049, sample length: 5246 +[default0]:Skipping sample id=1041836. Maximum sequence length: 2049, sample length: 2341 +[default0]:Skipping sample id=270697. Maximum sequence length: 2049, sample length: 2309 +[default0]:Skipping sample id=1326285. Maximum sequence length: 2049, sample length: 2181 +[default0]:Skipping sample id=17619. Maximum sequence length: 2049, sample length: 2687 +[default0]:Skipping sample id=1396713. Maximum sequence length: 2049, sample length: 4634 +[default0]:Skipping sample id=88070. Maximum sequence length: 2049, sample length: 2084 +[default0]:Skipping sample id=1228207. Maximum sequence length: 2049, sample length: 2609 +[default0]:Skipping sample id=673499. Maximum sequence length: 2049, sample length: 3632 +[default0]:Skipping sample id=290491. Maximum sequence length: 2049, sample length: 4424 +[default0]:Skipping sample id=1263812. Maximum sequence length: 2049, sample length: 3378 +[default0]:Skipping sample id=832533. Maximum sequence length: 2049, sample length: 2246 +[default0]:Skipping sample id=1164666. Maximum sequence length: 2049, sample length: 3332 +[default0]:Skipping sample id=953958. Maximum sequence length: 2049, sample length: 2158 +[default0]:Skipping sample id=465306. Maximum sequence length: 2049, sample length: 3214 +[default0]:Skipping sample id=1184365. Maximum sequence length: 2049, sample length: 2796 +[default0]:Skipping sample id=274388. Maximum sequence length: 2049, sample length: 2311 +[default0]:Skipping sample id=769106. Maximum sequence length: 2049, sample length: 3388 +[default0]:Skipping sample id=1112117. Maximum sequence length: 2049, sample length: 2832 +[default0]:Skipping sample id=49451. Maximum sequence length: 2049, sample length: 4707 +[default0]:Skipping sample id=1489772. Maximum sequence length: 2049, sample length: 2175 +[default0]:Skipping sample id=839703. Maximum sequence length: 2049, sample length: 2125 +[default0]:Skipping sample id=1296296. Maximum sequence length: 2049, sample length: 2070 +[default0]:Skipping sample id=1337783. Maximum sequence length: 2049, sample length: 3685 +[default0]:Skipping sample id=618287. Maximum sequence length: 2049, sample length: 3010 +[default0]:Skipping sample id=1162788. Maximum sequence length: 2049, sample length: 2217 +[default0]:Skipping sample id=997117. Maximum sequence length: 2049, sample length: 2123 +[default0]:Skipping sample id=421386. Maximum sequence length: 2049, sample length: 3719 +[default0]:Skipping sample id=944944. Maximum sequence length: 2049, sample length: 2483 +[default0]:Skipping sample id=855356. Maximum sequence length: 2049, sample length: 2801 +[default0]:Skipping sample id=487691. Maximum sequence length: 2049, sample length: 3239 +[default0]:Skipping sample id=536870. Maximum sequence length: 2049, sample length: 3781 +[default0]:Skipping sample id=305066. Maximum sequence length: 2049, sample length: 2271 +[default0]:Skipping sample id=980427. Maximum sequence length: 2049, sample length: 2093 +[default0]:Skipping sample id=981572. Maximum sequence length: 2049, sample length: 4087 +[default0]:Skipping sample id=1385114. Maximum sequence length: 2049, sample length: 2619 +[default0]:Skipping sample id=554968. Maximum sequence length: 2049, sample length: 2936 +[default0]:Skipping sample id=1485638. Maximum sequence length: 2049, sample length: 3363 +[default0]:Skipping sample id=233216. Maximum sequence length: 2049, sample length: 2473 +[default0]:Skipping sample id=83476. Maximum sequence length: 2049, sample length: 2450 +[default0]:Skipping sample id=523801. Maximum sequence length: 2049, sample length: 2330 +[default0]:Skipping sample id=478285. Maximum sequence length: 2049, sample length: 4745 +[default0]:Skipping sample id=499057. Maximum sequence length: 2049, sample length: 2420 +[default0]:Skipping sample id=1461383. Maximum sequence length: 2049, sample length: 2306 +[default0]:Skipping sample id=1377794. Maximum sequence length: 2049, sample length: 3168 +[default0]:Skipping sample id=986522. Maximum sequence length: 2049, sample length: 13949 +[default0]:Skipping sample id=979349. Maximum sequence length: 2049, sample length: 2834 +[default0]:Skipping sample id=1009355. Maximum sequence length: 2049, sample length: 2641 +[default0]:Skipping sample id=449851. Maximum sequence length: 2049, sample length: 2134 +[default0]:Skipping sample id=417496. Maximum sequence length: 2049, sample length: 2572 +[default0]:Skipping sample id=810743. Maximum sequence length: 2049, sample length: 2367 +[default0]:Skipping sample id=197598. Maximum sequence length: 2049, sample length: 2852 +[default0]:Skipping sample id=1082535. Maximum sequence length: 2049, sample length: 2376 +[default0]:Skipping sample id=919716. Maximum sequence length: 2049, sample length: 3359 +[default0]:Skipping sample id=1521659. Maximum sequence length: 2049, sample length: 2208 +[default0]:Skipping sample id=1286385. Maximum sequence length: 2049, sample length: 4225 +[default0]:Skipping sample id=1160504. Maximum sequence length: 2049, sample length: 2770 +[default0]:Skipping sample id=1436543. Maximum sequence length: 2049, sample length: 5009 +[default0]:Skipping sample id=724951. Maximum sequence length: 2049, sample length: 3605 +[default0]:Skipping sample id=1086119. Maximum sequence length: 2049, sample length: 2089 +[default0]:Skipping sample id=1042454. Maximum sequence length: 2049, sample length: 2825 +[default0]:Skipping sample id=329745. Maximum sequence length: 2049, sample length: 2834 +[default0]:Skipping sample id=972354. Maximum sequence length: 2049, sample length: 2060 +[default0]:Skipping sample id=1194595. Maximum sequence length: 2049, sample length: 3853 +[default0]:Skipping sample id=845625. Maximum sequence length: 2049, sample length: 4033 +[default0]:Skipping sample id=1327736. Maximum sequence length: 2049, sample length: 2421 +[default0]:Skipping sample id=376796. Maximum sequence length: 2049, sample length: 6275 +[default0]:Skipping sample id=735826. Maximum sequence length: 2049, sample length: 3846 +[default0]:Skipping sample id=588965. Maximum sequence length: 2049, sample length: 3101 +[default0]:Skipping sample id=820557. Maximum sequence length: 2049, sample length: 2817 +[default0]:Skipping sample id=1468550. Maximum sequence length: 2049, sample length: 2122 +[default0]:Skipping sample id=945616. Maximum sequence length: 2049, sample length: 2475 +[default0]:Skipping sample id=499973. Maximum sequence length: 2049, sample length: 4086 +[default0]:Skipping sample id=943036. Maximum sequence length: 2049, sample length: 2739 +[default0]:Skipping sample id=1001445. Maximum sequence length: 2049, sample length: 4624 +[default0]:Skipping sample id=873195. Maximum sequence length: 2049, sample length: 3112 +[default0]:Skipping sample id=1517777. Maximum sequence length: 2049, sample length: 2276 +[default0]:Skipping sample id=326125. Maximum sequence length: 2049, sample length: 3136 +[default0]:Skipping sample id=847579. Maximum sequence length: 2049, sample length: 4266 +[default0]:Skipping sample id=49924. Maximum sequence length: 2049, sample length: 2279 +[default0]:Skipping sample id=1343212. Maximum sequence length: 2049, sample length: 2358 +[default0]:Skipping sample id=529614. Maximum sequence length: 2049, sample length: 2213 +[default0]:Skipping sample id=1363197. Maximum sequence length: 2049, sample length: 2050 +[default0]:Skipping sample id=1214866. Maximum sequence length: 2049, sample length: 2366 +[default0]:Skipping sample id=1178510. Maximum sequence length: 2049, sample length: 2305 +[default0]:Skipping sample id=310953. Maximum sequence length: 2049, sample length: 5625 +[default0]:Skipping sample id=1313210. Maximum sequence length: 2049, sample length: 2360 +[default0]:Skipping sample id=103113. Maximum sequence length: 2049, sample length: 2239 +[default0]:Skipping sample id=139666. Maximum sequence length: 2049, sample length: 4247 +[default0]:Skipping sample id=177872. Maximum sequence length: 2049, sample length: 2920 +[default0]:Skipping sample id=104506. Maximum sequence length: 2049, sample length: 2879 +[default0]:Skipping sample id=309772. Maximum sequence length: 2049, sample length: 3210 +[default0]:Skipping sample id=1355767. Maximum sequence length: 2049, sample length: 2459 +[default0]:Skipping sample id=468061. Maximum sequence length: 2049, sample length: 2394 +[default0]:Skipping sample id=695266. Maximum sequence length: 2049, sample length: 3951 +[default0]:Skipping sample id=1159707. Maximum sequence length: 2049, sample length: 2920 +[default0]:Skipping sample id=90542. Maximum sequence length: 2049, sample length: 3689 +[default0]:Skipping sample id=1371113. Maximum sequence length: 2049, sample length: 2402 +[default0]:Skipping sample id=661274. Maximum sequence length: 2049, sample length: 3589 +[default0]:Skipping sample id=537265. Maximum sequence length: 2049, sample length: 6060 +[default0]:Skipping sample id=633542. Maximum sequence length: 2049, sample length: 2152 +[default0]:Skipping sample id=985162. Maximum sequence length: 2049, sample length: 2192 +[default0]:Skipping sample id=724032. Maximum sequence length: 2049, sample length: 2098 +[default0]:Skipping sample id=472939. Maximum sequence length: 2049, sample length: 2316 +[default0]:Skipping sample id=1152359. Maximum sequence length: 2049, sample length: 2284 +[default0]:Skipping sample id=326811. Maximum sequence length: 2049, sample length: 2302 +[default0]:Skipping sample id=163340. Maximum sequence length: 2049, sample length: 2515 +[default0]:Skipping sample id=187519. Maximum sequence length: 2049, sample length: 2605 +[default0]:Skipping sample id=1065909. Maximum sequence length: 2049, sample length: 2192 +[default0]:Skipping sample id=942076. Maximum sequence length: 2049, sample length: 2730 +[default0]:Skipping sample id=59660. Maximum sequence length: 2049, sample length: 2062 +[default0]:Skipping sample id=273391. Maximum sequence length: 2049, sample length: 3717 +[default0]:Skipping sample id=1133926. Maximum sequence length: 2049, sample length: 4103 +[default0]:Skipping sample id=356052. Maximum sequence length: 2049, sample length: 2328 +[default0]:Skipping sample id=527591. Maximum sequence length: 2049, sample length: 2538 +[default0]:Skipping sample id=1351164. Maximum sequence length: 2049, sample length: 2114 +[default0]:Skipping sample id=1311242. Maximum sequence length: 2049, sample length: 2395 +[default0]:Skipping sample id=1094757. Maximum sequence length: 2049, sample length: 2140 +[default0]:Skipping sample id=25565. Maximum sequence length: 2049, sample length: 4249 +[default0]:Skipping sample id=1277828. Maximum sequence length: 2049, sample length: 2514 +[default0]:Skipping sample id=363735. Maximum sequence length: 2049, sample length: 4252 +[default0]:Skipping sample id=622340. Maximum sequence length: 2049, sample length: 3463 +[default0]:Skipping sample id=1383682. Maximum sequence length: 2049, sample length: 2106 +[default0]:Skipping sample id=725447. Maximum sequence length: 2049, sample length: 2687 +[default0]:Skipping sample id=1177578. Maximum sequence length: 2049, sample length: 2186 +[default0]:Skipping sample id=294540. Maximum sequence length: 2049, sample length: 3532 +[default0]:Skipping sample id=161556. Maximum sequence length: 2049, sample length: 3795 +[default0]:Skipping sample id=1461563. Maximum sequence length: 2049, sample length: 3133 +[default0]:Skipping sample id=271709. Maximum sequence length: 2049, sample length: 3865 +[default0]:Skipping sample id=1484300. Maximum sequence length: 2049, sample length: 2403 +[default0]:Skipping sample id=444885. Maximum sequence length: 2049, sample length: 2881 +[default0]:Skipping sample id=1132954. Maximum sequence length: 2049, sample length: 2474 +[default0]:Skipping sample id=757625. Maximum sequence length: 2049, sample length: 3190 +[default0]:Skipping sample id=1407502. Maximum sequence length: 2049, sample length: 2111 +[default0]:Skipping sample id=669841. Maximum sequence length: 2049, sample length: 3477 +[default0]:Skipping sample id=1385577. Maximum sequence length: 2049, sample length: 2273 +[default0]:Skipping sample id=93235. Maximum sequence length: 2049, sample length: 2775 +[default0]:Skipping sample id=1054203. Maximum sequence length: 2049, sample length: 2074 +[default0]:Skipping sample id=611460. Maximum sequence length: 2049, sample length: 2189 +[default0]:Skipping sample id=367934. Maximum sequence length: 2049, sample length: 2157 +[default0]:Skipping sample id=784944. Maximum sequence length: 2049, sample length: 2275 +[default0]:Skipping sample id=172715. Maximum sequence length: 2049, sample length: 2442 +[default0]:Skipping sample id=1149435. Maximum sequence length: 2049, sample length: 2542 +[default0]:Skipping sample id=212266. Maximum sequence length: 2049, sample length: 3227 +[default0]:Skipping sample id=1530146. Maximum sequence length: 2049, sample length: 2067 +[default0]:Skipping sample id=1164419. Maximum sequence length: 2049, sample length: 2920 +[default0]:Skipping sample id=1269690. Maximum sequence length: 2049, sample length: 3186 +[default0]:Skipping sample id=1240774. Maximum sequence length: 2049, sample length: 4342 +[default0]:Skipping sample id=1380159. Maximum sequence length: 2049, sample length: 2465 +[default0]:Skipping sample id=734982. Maximum sequence length: 2049, sample length: 3182 +[default0]:Skipping sample id=156738. Maximum sequence length: 2049, sample length: 3114 +[default0]:Skipping sample id=911976. Maximum sequence length: 2049, sample length: 3002 +[default0]:Skipping sample id=1533019. Maximum sequence length: 2049, sample length: 2778 +[default0]:Skipping sample id=1445630. Maximum sequence length: 2049, sample length: 3669 +[default0]:Skipping sample id=1274362. Maximum sequence length: 2049, sample length: 3547 +[default0]:Skipping sample id=1173407. Maximum sequence length: 2049, sample length: 2934 +[default0]:Skipping sample id=924579. Maximum sequence length: 2049, sample length: 2179 +[default0]:Skipping sample id=72171. Maximum sequence length: 2049, sample length: 2583 +[default0]:Skipping sample id=543629. Maximum sequence length: 2049, sample length: 2961 +[default0]:Skipping sample id=1313311. Maximum sequence length: 2049, sample length: 3371 +[default0]:Skipping sample id=373385. Maximum sequence length: 2049, sample length: 2981 +[default0]:Skipping sample id=1123366. Maximum sequence length: 2049, sample length: 4459 +[default0]:Skipping sample id=370341. Maximum sequence length: 2049, sample length: 2187 +[default0]:Skipping sample id=1505369. Maximum sequence length: 2049, sample length: 2152 +[default0]:Skipping sample id=389425. Maximum sequence length: 2049, sample length: 2113 +[default0]:Skipping sample id=1288424. Maximum sequence length: 2049, sample length: 2052 +[default0]:Skipping sample id=319224. Maximum sequence length: 2049, sample length: 2394 +[default0]:Skipping sample id=379579. Maximum sequence length: 2049, sample length: 2886 +[default0]:Skipping sample id=628351. Maximum sequence length: 2049, sample length: 3130 +[default0]:Skipping sample id=992797. Maximum sequence length: 2049, sample length: 2637 +[default0]:Skipping sample id=210157. Maximum sequence length: 2049, sample length: 3414 +[default0]:Skipping sample id=1100084. Maximum sequence length: 2049, sample length: 3625 +[default0]:Skipping sample id=601640. Maximum sequence length: 2049, sample length: 2222 +[default0]:Skipping sample id=1525491. Maximum sequence length: 2049, sample length: 2578 +[default0]:Skipping sample id=706152. Maximum sequence length: 2049, sample length: 2053 +[default0]:Skipping sample id=1312325. Maximum sequence length: 2049, sample length: 2780 +[default0]:Skipping sample id=1484370. Maximum sequence length: 2049, sample length: 2746 +[default0]:Skipping sample id=1438695. Maximum sequence length: 2049, sample length: 4431 +[default0]:Skipping sample id=154494. Maximum sequence length: 2049, sample length: 2543 +[default0]:Skipping sample id=373956. Maximum sequence length: 2049, sample length: 2849 +[default0]:Skipping sample id=334374. Maximum sequence length: 2049, sample length: 2068 +[default0]:Skipping sample id=1073172. Maximum sequence length: 2049, sample length: 2662 +[default0]:Skipping sample id=1021253. Maximum sequence length: 2049, sample length: 2567 +[default0]:Skipping sample id=535254. Maximum sequence length: 2049, sample length: 3248 +[default0]:Skipping sample id=851421. Maximum sequence length: 2049, sample length: 4881 +[default0]:Skipping sample id=920156. Maximum sequence length: 2049, sample length: 2472 +[default0]:Skipping sample id=1398200. Maximum sequence length: 2049, sample length: 3589 +[default0]:Skipping sample id=299297. Maximum sequence length: 2049, sample length: 2350 +[default0]:Skipping sample id=1213198. Maximum sequence length: 2049, sample length: 4111 +[default0]:Skipping sample id=1278751. Maximum sequence length: 2049, sample length: 2663 +[default0]:Skipping sample id=1198460. Maximum sequence length: 2049, sample length: 2759 +[default0]:Skipping sample id=522726. Maximum sequence length: 2049, sample length: 2845 +[default0]:Skipping sample id=771906. Maximum sequence length: 2049, sample length: 2094 +[default0]:Skipping sample id=84952. Maximum sequence length: 2049, sample length: 2099 +[default0]:Skipping sample id=209872. Maximum sequence length: 2049, sample length: 3648 +[default0]:Skipping sample id=531958. Maximum sequence length: 2049, sample length: 2304 +[default0]:Skipping sample id=572589. Maximum sequence length: 2049, sample length: 3112 +[default0]:Skipping sample id=521092. Maximum sequence length: 2049, sample length: 2318 +[default0]:Skipping sample id=139606. Maximum sequence length: 2049, sample length: 3494 +[default0]:Skipping sample id=777215. Maximum sequence length: 2049, sample length: 2829 +[default0]:Skipping sample id=1039539. Maximum sequence length: 2049, sample length: 2378 +[default0]:Skipping sample id=637059. Maximum sequence length: 2049, sample length: 2155 +[default0]:Skipping sample id=172457. Maximum sequence length: 2049, sample length: 2440 +[default0]:Skipping sample id=58130. Maximum sequence length: 2049, sample length: 2801 +[default0]:Skipping sample id=562532. Maximum sequence length: 2049, sample length: 3367 +[default0]:Skipping sample id=1550676. Maximum sequence length: 2049, sample length: 2348 +[default0]:Skipping sample id=1112536. Maximum sequence length: 2049, sample length: 2352 +[default0]:Skipping sample id=384020. Maximum sequence length: 2049, sample length: 2299 +[default0]:Skipping sample id=724544. Maximum sequence length: 2049, sample length: 4106 +[default0]:Skipping sample id=1371856. Maximum sequence length: 2049, sample length: 2837 +[default0]:Skipping sample id=195404. Maximum sequence length: 2049, sample length: 2157 +[default0]:Skipping sample id=857517. Maximum sequence length: 2049, sample length: 3395 +[default0]:Skipping sample id=423403. Maximum sequence length: 2049, sample length: 4311 +[default0]:Skipping sample id=474026. Maximum sequence length: 2049, sample length: 2154 +[default0]:Skipping sample id=1282711. Maximum sequence length: 2049, sample length: 2571 +[default0]:Skipping sample id=681336. Maximum sequence length: 2049, sample length: 2507 +[default0]:Skipping sample id=73394. Maximum sequence length: 2049, sample length: 2996 +[default0]:Skipping sample id=1529627. Maximum sequence length: 2049, sample length: 3022 +[default0]:Skipping sample id=913624. Maximum sequence length: 2049, sample length: 2394 +[default0]:Skipping sample id=867883. Maximum sequence length: 2049, sample length: 2905 +[default0]:Skipping sample id=1549162. Maximum sequence length: 2049, sample length: 4098 +[default0]:Skipping sample id=159724. Maximum sequence length: 2049, sample length: 2061 +[default0]:Skipping sample id=172329. Maximum sequence length: 2049, sample length: 3559 +[default0]:Skipping sample id=231460. Maximum sequence length: 2049, sample length: 3622 +[default0]:Skipping sample id=25044. Maximum sequence length: 2049, sample length: 3324 +[default0]:Skipping sample id=805984. Maximum sequence length: 2049, sample length: 2285 +[default0]:Skipping sample id=165494. Maximum sequence length: 2049, sample length: 2087 +[default0]:Skipping sample id=984984. Maximum sequence length: 2049, sample length: 2902 +[default0]:Skipping sample id=1399736. Maximum sequence length: 2049, sample length: 2066 +[default0]:Skipping sample id=1516835. Maximum sequence length: 2049, sample length: 2692 +[default0]:Skipping sample id=576337. Maximum sequence length: 2049, sample length: 2093 +[default0]:Skipping sample id=232567. Maximum sequence length: 2049, sample length: 2364 +[default0]:Skipping sample id=1161967. Maximum sequence length: 2049, sample length: 3370 +[default0]:Skipping sample id=229459. Maximum sequence length: 2049, sample length: 3307 +[default0]:Skipping sample id=564398. Maximum sequence length: 2049, sample length: 3391 +[default0]:Skipping sample id=232309. Maximum sequence length: 2049, sample length: 5242 +[default0]:Skipping sample id=206701. Maximum sequence length: 2049, sample length: 3784 +[default0]:Skipping sample id=70602. Maximum sequence length: 2049, sample length: 2385 +[default0]:Skipping sample id=819701. Maximum sequence length: 2049, sample length: 2830 +[default0]:Skipping sample id=332845. Maximum sequence length: 2049, sample length: 2217 +[default0]:Skipping sample id=610168. Maximum sequence length: 2049, sample length: 2844 +[default0]:Skipping sample id=314732. Maximum sequence length: 2049, sample length: 3262 +[default0]:Skipping sample id=1062749. Maximum sequence length: 2049, sample length: 3192 +[default0]:Skipping sample id=1369055. Maximum sequence length: 2049, sample length: 2574 +[default0]:Skipping sample id=906928. Maximum sequence length: 2049, sample length: 2493 +[default0]:Skipping sample id=123354. Maximum sequence length: 2049, sample length: 2510 +[default0]:Skipping sample id=655500. Maximum sequence length: 2049, sample length: 5743 +[default0]:Skipping sample id=745268. Maximum sequence length: 2049, sample length: 2187 +[default0]:Skipping sample id=322919. Maximum sequence length: 2049, sample length: 2761 +[default0]:Skipping sample id=1479364. Maximum sequence length: 2049, sample length: 2371 +[default0]:Skipping sample id=1085835. Maximum sequence length: 2049, sample length: 3828 +[default0]:Skipping sample id=1485009. Maximum sequence length: 2049, sample length: 6090 +[default0]:Skipping sample id=581443. Maximum sequence length: 2049, sample length: 3433 +[default0]:Skipping sample id=746239. Maximum sequence length: 2049, sample length: 2249 +[default0]:Skipping sample id=234692. Maximum sequence length: 2049, sample length: 2393 +[default0]:Skipping sample id=764473. Maximum sequence length: 2049, sample length: 2948 +[default0]:Skipping sample id=97990. Maximum sequence length: 2049, sample length: 5666 +[default0]:Skipping sample id=1179508. Maximum sequence length: 2049, sample length: 2773 +[default0]:Skipping sample id=144684. Maximum sequence length: 2049, sample length: 2520 +[default0]:Skipping sample id=72129. Maximum sequence length: 2049, sample length: 7100 +[default0]:Skipping sample id=1380097. Maximum sequence length: 2049, sample length: 2336 +[default0]:Skipping sample id=1490890. Maximum sequence length: 2049, sample length: 2157 +[default0]:Skipping sample id=1030211. Maximum sequence length: 2049, sample length: 3718 +[default0]:Skipping sample id=1323169. Maximum sequence length: 2049, sample length: 6384 +[default0]:Skipping sample id=1269869. Maximum sequence length: 2049, sample length: 3332 +[default0]:Skipping sample id=375054. Maximum sequence length: 2049, sample length: 2996 +[default0]:Skipping sample id=1571388. Maximum sequence length: 2049, sample length: 5818 +[default0]:Skipping sample id=983856. Maximum sequence length: 2049, sample length: 2568 +[default0]:Skipping sample id=814974. Maximum sequence length: 2049, sample length: 3585 +[default0]:Skipping sample id=1516943. Maximum sequence length: 2049, sample length: 2462 +[default0]:Skipping sample id=805207. Maximum sequence length: 2049, sample length: 2689 +[default0]:Skipping sample id=70590. Maximum sequence length: 2049, sample length: 2927 +[default0]:Skipping sample id=361893. Maximum sequence length: 2049, sample length: 2591 +[default0]:Skipping sample id=208948. Maximum sequence length: 2049, sample length: 2483 +[default0]:Skipping sample id=838573. Maximum sequence length: 2049, sample length: 2162 +[default0]:Skipping sample id=755657. Maximum sequence length: 2049, sample length: 2268 +[default0]:Skipping sample id=708919. Maximum sequence length: 2049, sample length: 2760 +[default0]:Skipping sample id=472364. Maximum sequence length: 2049, sample length: 2411 +[default0]:Skipping sample id=738826. Maximum sequence length: 2049, sample length: 5226 +[default0]:Skipping sample id=379713. Maximum sequence length: 2049, sample length: 2384 +[default0]:Skipping sample id=402056. Maximum sequence length: 2049, sample length: 2602 +[default0]:Skipping sample id=1151944. Maximum sequence length: 2049, sample length: 2230 +[default0]:Skipping sample id=60257. Maximum sequence length: 2049, sample length: 2371 +[default0]:Skipping sample id=58944. Maximum sequence length: 2049, sample length: 2050 +[default0]:Skipping sample id=1430212. Maximum sequence length: 2049, sample length: 2938 +[default0]:Skipping sample id=1230640. Maximum sequence length: 2049, sample length: 2852 +[default0]:Skipping sample id=1282488. Maximum sequence length: 2049, sample length: 2719 +[default0]:Skipping sample id=177232. Maximum sequence length: 2049, sample length: 2333 +[default0]:Skipping sample id=94285. Maximum sequence length: 2049, sample length: 2050 +[default0]:Skipping sample id=1176797. Maximum sequence length: 2049, sample length: 3624 +[default0]:Skipping sample id=804966. Maximum sequence length: 2049, sample length: 2645 +[default0]:Skipping sample id=264800. Maximum sequence length: 2049, sample length: 3564 +[default0]:Skipping sample id=904371. Maximum sequence length: 2049, sample length: 2184 +[default0]:Skipping sample id=1221216. Maximum sequence length: 2049, sample length: 3334 +[default0]:Skipping sample id=1511522. Maximum sequence length: 2049, sample length: 2313 +[default0]:Skipping sample id=1037491. Maximum sequence length: 2049, sample length: 2438 +[default0]:Skipping sample id=918279. Maximum sequence length: 2049, sample length: 3123 +[default0]:Skipping sample id=6683. Maximum sequence length: 2049, sample length: 2953 +[default0]:Skipping sample id=166320. Maximum sequence length: 2049, sample length: 2741 +[default0]:Skipping sample id=695860. Maximum sequence length: 2049, sample length: 2071 +[default0]:Skipping sample id=414881. Maximum sequence length: 2049, sample length: 2334 +[default0]:Skipping sample id=1546627. Maximum sequence length: 2049, sample length: 2330 +[default0]:Skipping sample id=174608. Maximum sequence length: 2049, sample length: 2528 +[default0]:Skipping sample id=983622. Maximum sequence length: 2049, sample length: 2561 +[default0]:Skipping sample id=1008015. Maximum sequence length: 2049, sample length: 3991 +[default0]:Skipping sample id=719605. Maximum sequence length: 2049, sample length: 2110 +[default0]:Skipping sample id=712140. Maximum sequence length: 2049, sample length: 2945 +[default0]:Skipping sample id=130217. Maximum sequence length: 2049, sample length: 2265 +[default0]:Skipping sample id=109006. Maximum sequence length: 2049, sample length: 4181 +[default0]:Skipping sample id=1080564. Maximum sequence length: 2049, sample length: 3353 +[default0]:Skipping sample id=733845. Maximum sequence length: 2049, sample length: 2815 +[default0]:Skipping sample id=658224. Maximum sequence length: 2049, sample length: 2267 +[default0]:Skipping sample id=372592. Maximum sequence length: 2049, sample length: 3701 +[default0]:Skipping sample id=242654. Maximum sequence length: 2049, sample length: 2106 +[default0]:Skipping sample id=795749. Maximum sequence length: 2049, sample length: 2186 +[default0]:Skipping sample id=1187778. Maximum sequence length: 2049, sample length: 2124 +[default0]:Skipping sample id=312153. Maximum sequence length: 2049, sample length: 3122 +[default0]:Skipping sample id=1107573. Maximum sequence length: 2049, sample length: 2390 +[default0]:Skipping sample id=1244771. Maximum sequence length: 2049, sample length: 4700 +[default0]:Skipping sample id=1202297. Maximum sequence length: 2049, sample length: 4677 +[default0]:Skipping sample id=1336764. Maximum sequence length: 2049, sample length: 3955 +[default0]:Skipping sample id=61687. Maximum sequence length: 2049, sample length: 2182 +[default0]:Skipping sample id=61906. Maximum sequence length: 2049, sample length: 3347 +[default0]:Skipping sample id=592800. Maximum sequence length: 2049, sample length: 4301 +[default0]:Skipping sample id=476723. Maximum sequence length: 2049, sample length: 3370 +[default0]:Skipping sample id=833624. Maximum sequence length: 2049, sample length: 2392 +[default0]:Skipping sample id=1303301. Maximum sequence length: 2049, sample length: 3285 +[default0]:Skipping sample id=759582. Maximum sequence length: 2049, sample length: 2498 +[default0]:Skipping sample id=1033621. Maximum sequence length: 2049, sample length: 3473 +[default0]:Skipping sample id=442016. Maximum sequence length: 2049, sample length: 3191 +[default0]:Skipping sample id=573330. Maximum sequence length: 2049, sample length: 2368 +[default0]:Skipping sample id=31431. Maximum sequence length: 2049, sample length: 3099 +[default0]:Skipping sample id=1183079. Maximum sequence length: 2049, sample length: 2105 +[default0]:Skipping sample id=217556. Maximum sequence length: 2049, sample length: 2530 +[default0]:Skipping sample id=1531844. Maximum sequence length: 2049, sample length: 2742 +[default0]:Skipping sample id=340508. Maximum sequence length: 2049, sample length: 2085 +[default0]:Skipping sample id=219666. Maximum sequence length: 2049, sample length: 2721 +[default0]:Skipping sample id=970724. Maximum sequence length: 2049, sample length: 2999 +[default0]:Skipping sample id=1123649. Maximum sequence length: 2049, sample length: 2634 +[default0]:Skipping sample id=849378. Maximum sequence length: 2049, sample length: 3411 +[default0]:Skipping sample id=179508. Maximum sequence length: 2049, sample length: 3254 +[default0]:Skipping sample id=1130039. Maximum sequence length: 2049, sample length: 2195 +[default0]:Skipping sample id=803457. Maximum sequence length: 2049, sample length: 3211 +[default0]:Skipping sample id=696050. Maximum sequence length: 2049, sample length: 2215 +[default0]:Skipping sample id=1048261. Maximum sequence length: 2049, sample length: 2611 +[default0]:Skipping sample id=711416. Maximum sequence length: 2049, sample length: 4190 +[default0]:Skipping sample id=1366097. Maximum sequence length: 2049, sample length: 2437 +[default0]:Skipping sample id=1127306. Maximum sequence length: 2049, sample length: 4819 +[default0]:Skipping sample id=1017599. Maximum sequence length: 2049, sample length: 2076 +[default0]:Skipping sample id=1472182. Maximum sequence length: 2049, sample length: 2169 +[default0]:Skipping sample id=265110. Maximum sequence length: 2049, sample length: 2985 +[default0]:Skipping sample id=608060. Maximum sequence length: 2049, sample length: 3377 +[default0]:Skipping sample id=1245577. Maximum sequence length: 2049, sample length: 2097 +[default0]:Skipping sample id=483754. Maximum sequence length: 2049, sample length: 2136 +[default0]:Skipping sample id=1465711. Maximum sequence length: 2049, sample length: 6104 +[default0]:Skipping sample id=539526. Maximum sequence length: 2049, sample length: 2496 +[default0]:Skipping sample id=1306424. Maximum sequence length: 2049, sample length: 3604 +[default0]:Skipping sample id=873932. Maximum sequence length: 2049, sample length: 2341 +[default0]:Skipping sample id=30477. Maximum sequence length: 2049, sample length: 5774 +[default0]:Skipping sample id=549123. Maximum sequence length: 2049, sample length: 2503 +[default0]:Skipping sample id=1231650. Maximum sequence length: 2049, sample length: 2885 +[default0]:Skipping sample id=952339. Maximum sequence length: 2049, sample length: 3246 +[default0]:Skipping sample id=729006. Maximum sequence length: 2049, sample length: 2091 +[default0]:Skipping sample id=453313. Maximum sequence length: 2049, sample length: 2164 +[default0]:Skipping sample id=113475. Maximum sequence length: 2049, sample length: 2355 +[default0]:Skipping sample id=233212. Maximum sequence length: 2049, sample length: 3223 +[default0]:Skipping sample id=951577. Maximum sequence length: 2049, sample length: 2565 +[default0]:Skipping sample id=807521. Maximum sequence length: 2049, sample length: 4149 +[default0]:Skipping sample id=1219442. Maximum sequence length: 2049, sample length: 3867 +[default0]:Skipping sample id=273045. Maximum sequence length: 2049, sample length: 5011 +[default0]:Skipping sample id=545599. Maximum sequence length: 2049, sample length: 2785 +[default0]:Skipping sample id=1219121. Maximum sequence length: 2049, sample length: 3471 +[default0]:Skipping sample id=284858. Maximum sequence length: 2049, sample length: 2056 +[default0]:Skipping sample id=498677. Maximum sequence length: 2049, sample length: 2899 +[default0]:Skipping sample id=690287. Maximum sequence length: 2049, sample length: 2609 +[default0]:Skipping sample id=1198474. Maximum sequence length: 2049, sample length: 2452 +[default0]:Skipping sample id=1281402. Maximum sequence length: 2049, sample length: 2872 +[default0]:Skipping sample id=1402860. Maximum sequence length: 2049, sample length: 3175 +[default0]:Skipping sample id=1495560. Maximum sequence length: 2049, sample length: 3102 +[default0]:Skipping sample id=513464. Maximum sequence length: 2049, sample length: 3805 +[default0]:Skipping sample id=1365095. Maximum sequence length: 2049, sample length: 2683 +[default0]:Skipping sample id=1410840. Maximum sequence length: 2049, sample length: 2873 +[default0]:Skipping sample id=1523487. Maximum sequence length: 2049, sample length: 3216 +[default0]:Skipping sample id=624053. Maximum sequence length: 2049, sample length: 2321 +[default0]:Skipping sample id=817750. Maximum sequence length: 2049, sample length: 2579 +[default0]:Skipping sample id=907023. Maximum sequence length: 2049, sample length: 3063 +[default0]:Skipping sample id=806395. Maximum sequence length: 2049, sample length: 3296 +[default0]:Skipping sample id=272053. Maximum sequence length: 2049, sample length: 2189 +[default0]:Skipping sample id=1241848. Maximum sequence length: 2049, sample length: 4957 +[default0]:Skipping sample id=206581. Maximum sequence length: 2049, sample length: 3129 +[default0]:Skipping sample id=430009. Maximum sequence length: 2049, sample length: 2907 +[default0]:Skipping sample id=928363. Maximum sequence length: 2049, sample length: 2423 +[default0]:Skipping sample id=1519174. Maximum sequence length: 2049, sample length: 3603 +[default0]:Skipping sample id=1407089. Maximum sequence length: 2049, sample length: 2261 +[default0]:Skipping sample id=1237013. Maximum sequence length: 2049, sample length: 4266 +[default0]:Skipping sample id=750555. Maximum sequence length: 2049, sample length: 2941 +[default0]:Skipping sample id=1093690. Maximum sequence length: 2049, sample length: 3251 +[default0]:Skipping sample id=1401220. Maximum sequence length: 2049, sample length: 2126 +[default0]:Skipping sample id=938956. Maximum sequence length: 2049, sample length: 2318 +[default0]:Skipping sample id=459954. Maximum sequence length: 2049, sample length: 2262 +[default0]:Skipping sample id=177. Maximum sequence length: 2049, sample length: 2069 +[default0]:Skipping sample id=607. Maximum sequence length: 2049, sample length: 2668 +[default0]:Skipping sample id=359945. Maximum sequence length: 2049, sample length: 2199 +[default0]:Skipping sample id=353454. Maximum sequence length: 2049, sample length: 2764 +[default0]:Skipping sample id=1063817. Maximum sequence length: 2049, sample length: 2296 +[default0]:Skipping sample id=708393. Maximum sequence length: 2049, sample length: 3025 +[default0]:Skipping sample id=242653. Maximum sequence length: 2049, sample length: 2229 +[default0]:Skipping sample id=619157. Maximum sequence length: 2049, sample length: 2872 +[default0]:Skipping sample id=1154486. Maximum sequence length: 2049, sample length: 2062 +[default0]:Skipping sample id=1401012. Maximum sequence length: 2049, sample length: 2091 +[default0]:Skipping sample id=211365. Maximum sequence length: 2049, sample length: 3035 +[default0]:Skipping sample id=968060. Maximum sequence length: 2049, sample length: 2648 +[default0]:Skipping sample id=157094. Maximum sequence length: 2049, sample length: 2367 +[default0]:Skipping sample id=1480765. Maximum sequence length: 2049, sample length: 3480 +[default0]:Skipping sample id=836185. Maximum sequence length: 2049, sample length: 2711 +[default0]:Skipping sample id=1498592. Maximum sequence length: 2049, sample length: 2311 +[default0]:Skipping sample id=1072497. Maximum sequence length: 2049, sample length: 2168 +[default0]:Skipping sample id=1254690. Maximum sequence length: 2049, sample length: 2710 +[default0]:Skipping sample id=940294. Maximum sequence length: 2049, sample length: 3076 +[default0]:Skipping sample id=1571634. Maximum sequence length: 2049, sample length: 3442 +[default0]:Skipping sample id=393924. Maximum sequence length: 2049, sample length: 2970 +[default0]:Skipping sample id=535271. Maximum sequence length: 2049, sample length: 2232 +[default0]:Skipping sample id=400806. Maximum sequence length: 2049, sample length: 6553 +[default0]:Skipping sample id=1081248. Maximum sequence length: 2049, sample length: 3021 +[default0]:Skipping sample id=888174. Maximum sequence length: 2049, sample length: 6027 +[default0]:Skipping sample id=495592. Maximum sequence length: 2049, sample length: 2811 +[default0]:Skipping sample id=816051. Maximum sequence length: 2049, sample length: 4350 +[default0]:Skipping sample id=213833. Maximum sequence length: 2049, sample length: 2079 +[default0]:Skipping sample id=1452291. Maximum sequence length: 2049, sample length: 4039 +[default0]:Skipping sample id=92069. Maximum sequence length: 2049, sample length: 2444 +[default0]:Skipping sample id=775308. Maximum sequence length: 2049, sample length: 2375 +[default0]:Skipping sample id=502246. Maximum sequence length: 2049, sample length: 2762 +[default0]:Skipping sample id=518537. Maximum sequence length: 2049, sample length: 2697 +[default0]:Skipping sample id=411786. Maximum sequence length: 2049, sample length: 3221 +[default0]:Skipping sample id=294616. Maximum sequence length: 2049, sample length: 2505 +[default0]:Skipping sample id=1300298. Maximum sequence length: 2049, sample length: 2213 +[default0]:Skipping sample id=347701. Maximum sequence length: 2049, sample length: 2693 +[default0]:Skipping sample id=1436032. Maximum sequence length: 2049, sample length: 2106 +[default0]:Skipping sample id=555677. Maximum sequence length: 2049, sample length: 2636 +[default0]:Skipping sample id=620604. Maximum sequence length: 2049, sample length: 2317 +[default0]:Skipping sample id=546849. Maximum sequence length: 2049, sample length: 2811 +[default0]:Skipping sample id=543399. Maximum sequence length: 2049, sample length: 3369 +[default0]:Skipping sample id=1519498. Maximum sequence length: 2049, sample length: 2715 +[default0]:Skipping sample id=1220632. Maximum sequence length: 2049, sample length: 3398 +[default0]:Skipping sample id=970512. Maximum sequence length: 2049, sample length: 2964 +[default0]:Skipping sample id=88040. Maximum sequence length: 2049, sample length: 2345 +[default0]:Skipping sample id=828142. Maximum sequence length: 2049, sample length: 2050 +[default0]:Skipping sample id=972081. Maximum sequence length: 2049, sample length: 3202 +[default0]:Skipping sample id=194705. Maximum sequence length: 2049, sample length: 4957 +[default0]:Skipping sample id=1059468. Maximum sequence length: 2049, sample length: 2070 +[default0]:Skipping sample id=71805. Maximum sequence length: 2049, sample length: 2292 +[default0]:Skipping sample id=293426. Maximum sequence length: 2049, sample length: 2459 +[default0]:Skipping sample id=685581. Maximum sequence length: 2049, sample length: 3835 +[default0]:Skipping sample id=693207. Maximum sequence length: 2049, sample length: 2073 +[default0]:Skipping sample id=680823. Maximum sequence length: 2049, sample length: 2069 +[default0]:Skipping sample id=1162386. Maximum sequence length: 2049, sample length: 5256 +[default0]:Skipping sample id=613554. Maximum sequence length: 2049, sample length: 2437 +[default0]:Skipping sample id=1320934. Maximum sequence length: 2049, sample length: 2860 +[default0]:Skipping sample id=900318. Maximum sequence length: 2049, sample length: 3442 +[default0]:Skipping sample id=1204912. Maximum sequence length: 2049, sample length: 2390 +[default0]:Skipping sample id=1029475. Maximum sequence length: 2049, sample length: 2195 +[default0]:Skipping sample id=1404161. Maximum sequence length: 2049, sample length: 2985 +[default0]:Skipping sample id=1359510. Maximum sequence length: 2049, sample length: 2406 +[default0]:Skipping sample id=684098. Maximum sequence length: 2049, sample length: 3424 +[default0]:Skipping sample id=301905. Maximum sequence length: 2049, sample length: 2163 +[default0]:Skipping sample id=1069130. Maximum sequence length: 2049, sample length: 2471 +[default0]:Skipping sample id=518688. Maximum sequence length: 2049, sample length: 2554 +[default0]:Skipping sample id=979890. Maximum sequence length: 2049, sample length: 6024 +[default0]:Skipping sample id=986808. Maximum sequence length: 2049, sample length: 2680 +[default0]:Skipping sample id=822745. Maximum sequence length: 2049, sample length: 2081 +[default0]:Skipping sample id=761189. Maximum sequence length: 2049, sample length: 3870 +[default0]:Skipping sample id=1398375. Maximum sequence length: 2049, sample length: 2260 +[default0]:Skipping sample id=1045790. Maximum sequence length: 2049, sample length: 2414 +[default0]:Skipping sample id=1534437. Maximum sequence length: 2049, sample length: 2928 +[default0]:Skipping sample id=191544. Maximum sequence length: 2049, sample length: 2583 +[default0]:Skipping sample id=1420418. Maximum sequence length: 2049, sample length: 5636 +[default0]:Skipping sample id=1385343. Maximum sequence length: 2049, sample length: 2184 +[default0]:Skipping sample id=967952. Maximum sequence length: 2049, sample length: 3437 +[default0]:Skipping sample id=1038552. Maximum sequence length: 2049, sample length: 2668 +[default0]:Skipping sample id=1050884. Maximum sequence length: 2049, sample length: 2882 +[default0]:Skipping sample id=1286158. Maximum sequence length: 2049, sample length: 2763 +[default0]:Skipping sample id=117342. Maximum sequence length: 2049, sample length: 2605 +[default0]:Skipping sample id=1456854. Maximum sequence length: 2049, sample length: 2990 +[default0]:Skipping sample id=198486. Maximum sequence length: 2049, sample length: 3621 +[default0]:Skipping sample id=122469. Maximum sequence length: 2049, sample length: 2469 +[default0]:Skipping sample id=1094450. Maximum sequence length: 2049, sample length: 2846 +[default0]:Skipping sample id=282431. Maximum sequence length: 2049, sample length: 3878 +[default0]:Skipping sample id=1508547. Maximum sequence length: 2049, sample length: 2189 +[default0]:Skipping sample id=5834. Maximum sequence length: 2049, sample length: 3094 +[default0]:Skipping sample id=917908. Maximum sequence length: 2049, sample length: 2100 +[default0]:Skipping sample id=859721. Maximum sequence length: 2049, sample length: 2134 +[default0]:Skipping sample id=598173. Maximum sequence length: 2049, sample length: 2946 +[default0]:Skipping sample id=538542. Maximum sequence length: 2049, sample length: 3278 +[default0]:Skipping sample id=618349. Maximum sequence length: 2049, sample length: 4209 +[default0]:Skipping sample id=184975. Maximum sequence length: 2049, sample length: 4834 +[default0]:Skipping sample id=333363. Maximum sequence length: 2049, sample length: 2693 +[default0]:Skipping sample id=1279685. Maximum sequence length: 2049, sample length: 2564 +[default0]:Skipping sample id=474227. Maximum sequence length: 2049, sample length: 2051 +[default0]:Skipping sample id=35877. Maximum sequence length: 2049, sample length: 2561 +[default0]:Skipping sample id=1175108. Maximum sequence length: 2049, sample length: 2288 +[default0]:Skipping sample id=114215. Maximum sequence length: 2049, sample length: 2064 +[default0]:Skipping sample id=1325843. Maximum sequence length: 2049, sample length: 3662 +[default0]:Skipping sample id=178808. Maximum sequence length: 2049, sample length: 4081 +[default0]:Skipping sample id=1135033. Maximum sequence length: 2049, sample length: 2198 +[default0]:Skipping sample id=1341410. Maximum sequence length: 2049, sample length: 2204 +[default0]:Skipping sample id=1507470. Maximum sequence length: 2049, sample length: 2307 +[default0]:Skipping sample id=154209. Maximum sequence length: 2049, sample length: 5748 +[default0]:Skipping sample id=31867. Maximum sequence length: 2049, sample length: 2643 +[default0]:Skipping sample id=113179. Maximum sequence length: 2049, sample length: 2566 +[default0]:Skipping sample id=1415214. Maximum sequence length: 2049, sample length: 4620 +[default0]:Skipping sample id=1272331. Maximum sequence length: 2049, sample length: 2672 +[default0]:Skipping sample id=988642. Maximum sequence length: 2049, sample length: 2170 +[default0]:Skipping sample id=200110. Maximum sequence length: 2049, sample length: 3184 +[default0]:Skipping sample id=470481. Maximum sequence length: 2049, sample length: 3084 +[default0]:Skipping sample id=108938. Maximum sequence length: 2049, sample length: 2615 +[default0]:Skipping sample id=911722. Maximum sequence length: 2049, sample length: 3188 +[default0]:Skipping sample id=1241193. Maximum sequence length: 2049, sample length: 2711 +[default0]:Skipping sample id=1131859. Maximum sequence length: 2049, sample length: 2294 +[default0]:Skipping sample id=1533204. Maximum sequence length: 2049, sample length: 3050 +[default0]:Skipping sample id=349703. Maximum sequence length: 2049, sample length: 3097 +[default0]:Skipping sample id=1242237. Maximum sequence length: 2049, sample length: 2675 +[default0]:Skipping sample id=1547894. Maximum sequence length: 2049, sample length: 2406 +[default0]:Skipping sample id=1568941. Maximum sequence length: 2049, sample length: 2312 +[default0]:Skipping sample id=1248449. Maximum sequence length: 2049, sample length: 2511 +[default0]:Skipping sample id=205965. Maximum sequence length: 2049, sample length: 2178 +[default0]:Skipping sample id=642044. Maximum sequence length: 2049, sample length: 3394 +[default0]:Skipping sample id=605800. Maximum sequence length: 2049, sample length: 2578 +[default0]:Skipping sample id=679915. Maximum sequence length: 2049, sample length: 2414 +[default0]:Skipping sample id=1562289. Maximum sequence length: 2049, sample length: 2235 +[default0]:Skipping sample id=549233. Maximum sequence length: 2049, sample length: 5058 +[default0]:Skipping sample id=1477768. Maximum sequence length: 2049, sample length: 4581 +[default0]:Skipping sample id=726146. Maximum sequence length: 2049, sample length: 2992 +[default0]:Skipping sample id=1069294. Maximum sequence length: 2049, sample length: 5816 +[default0]:Skipping sample id=948591. Maximum sequence length: 2049, sample length: 4087 +[default0]:Skipping sample id=511648. Maximum sequence length: 2049, sample length: 2057 +[default0]:Skipping sample id=675833. Maximum sequence length: 2049, sample length: 2120 +[default0]:Skipping sample id=884601. Maximum sequence length: 2049, sample length: 2608 +[default0]:Skipping sample id=1453090. Maximum sequence length: 2049, sample length: 2769 +[default0]:Skipping sample id=277987. Maximum sequence length: 2049, sample length: 2430 +[default0]:Skipping sample id=1420766. Maximum sequence length: 2049, sample length: 2254 +[default0]:Skipping sample id=248289. Maximum sequence length: 2049, sample length: 2726 +[default0]:Skipping sample id=947180. Maximum sequence length: 2049, sample length: 2337 +[default0]:Skipping sample id=19482. Maximum sequence length: 2049, sample length: 2367 +[default0]:Skipping sample id=192613. Maximum sequence length: 2049, sample length: 2587 +[default0]:Skipping sample id=1143272. Maximum sequence length: 2049, sample length: 2937 +[default0]:Skipping sample id=135816. Maximum sequence length: 2049, sample length: 4797 +[default0]:Skipping sample id=975580. Maximum sequence length: 2049, sample length: 3448 +[default0]:Skipping sample id=1103135. Maximum sequence length: 2049, sample length: 3094 +[default0]:Skipping sample id=883088. Maximum sequence length: 2049, sample length: 4405 +[default0]:Skipping sample id=36813. Maximum sequence length: 2049, sample length: 2178 +[default0]:Skipping sample id=619145. Maximum sequence length: 2049, sample length: 3726 +[default0]:Skipping sample id=324413. Maximum sequence length: 2049, sample length: 2151 +[default0]:Skipping sample id=729926. Maximum sequence length: 2049, sample length: 2894 +[default0]:Skipping sample id=83077. Maximum sequence length: 2049, sample length: 2195 +[default0]:Skipping sample id=224266. Maximum sequence length: 2049, sample length: 2343 +[default0]:Skipping sample id=1113481. Maximum sequence length: 2049, sample length: 2079 +[default0]:Skipping sample id=249591. Maximum sequence length: 2049, sample length: 2253 +[default0]:Skipping sample id=473134. Maximum sequence length: 2049, sample length: 2445 +[default0]:Skipping sample id=1160446. Maximum sequence length: 2049, sample length: 2576 +[default0]:Skipping sample id=143987. Maximum sequence length: 2049, sample length: 2223 +[default0]:Skipping sample id=796101. Maximum sequence length: 2049, sample length: 3271 +[default0]:Skipping sample id=890778. Maximum sequence length: 2049, sample length: 2729 +[default0]:Skipping sample id=482423. Maximum sequence length: 2049, sample length: 4608 +[default0]:Skipping sample id=16204. Maximum sequence length: 2049, sample length: 2185 +[default0]:Skipping sample id=934215. Maximum sequence length: 2049, sample length: 2157 +[default0]:Skipping sample id=133197. Maximum sequence length: 2049, sample length: 2082 +[default0]:Skipping sample id=1172538. Maximum sequence length: 2049, sample length: 4096 +[default0]:Skipping sample id=734053. Maximum sequence length: 2049, sample length: 2890 +[default0]:Skipping sample id=50835. Maximum sequence length: 2049, sample length: 3133 +[default0]:Skipping sample id=205037. Maximum sequence length: 2049, sample length: 2668 +[default0]:Skipping sample id=1470052. Maximum sequence length: 2049, sample length: 4086 +[default0]:Skipping sample id=537006. Maximum sequence length: 2049, sample length: 2094 +[default0]:Skipping sample id=972173. Maximum sequence length: 2049, sample length: 3711 +[default0]:Skipping sample id=1268072. Maximum sequence length: 2049, sample length: 2684 +[default0]:Skipping sample id=834831. Maximum sequence length: 2049, sample length: 2511 +[default0]:Skipping sample id=1137040. Maximum sequence length: 2049, sample length: 3688 +[default0]:Skipping sample id=240054. Maximum sequence length: 2049, sample length: 2722 +[default0]:Skipping sample id=407415. Maximum sequence length: 2049, sample length: 4047 +[default0]:Skipping sample id=45138. Maximum sequence length: 2049, sample length: 2515 +[default0]:Skipping sample id=1240113. Maximum sequence length: 2049, sample length: 2202 +[default0]:Skipping sample id=207453. Maximum sequence length: 2049, sample length: 2156 +[default0]:Skipping sample id=1396358. Maximum sequence length: 2049, sample length: 2390 +[default0]:Skipping sample id=413315. Maximum sequence length: 2049, sample length: 2505 +[default0]:Skipping sample id=1203953. Maximum sequence length: 2049, sample length: 2590 +[default0]:Skipping sample id=759368. Maximum sequence length: 2049, sample length: 5049 +[default0]:Skipping sample id=860237. Maximum sequence length: 2049, sample length: 3151 +[default0]:Skipping sample id=109076. Maximum sequence length: 2049, sample length: 2517 +[default0]:Skipping sample id=282628. Maximum sequence length: 2049, sample length: 2418 +[default0]:Skipping sample id=1036737. Maximum sequence length: 2049, sample length: 2934 +[default0]:Skipping sample id=398652. Maximum sequence length: 2049, sample length: 3140 +[default0]:Skipping sample id=1366833. Maximum sequence length: 2049, sample length: 4145 +[default0]:Skipping sample id=1185631. Maximum sequence length: 2049, sample length: 4158 +[default0]:Skipping sample id=794054. Maximum sequence length: 2049, sample length: 5318 +[default0]:Skipping sample id=1100774. Maximum sequence length: 2049, sample length: 2106 +[default0]:Skipping sample id=1088924. Maximum sequence length: 2049, sample length: 2442 +[default0]:Skipping sample id=1426203. Maximum sequence length: 2049, sample length: 2912 +[default0]:Skipping sample id=584128. Maximum sequence length: 2049, sample length: 2538 +[default0]:Skipping sample id=368076. Maximum sequence length: 2049, sample length: 3293 +[default0]:Skipping sample id=1510726. Maximum sequence length: 2049, sample length: 2654 +[default0]:Skipping sample id=39860. Maximum sequence length: 2049, sample length: 2833 +[default0]:Skipping sample id=1539632. Maximum sequence length: 2049, sample length: 2246 +[default0]:Skipping sample id=1009490. Maximum sequence length: 2049, sample length: 2898 +[default0]:Skipping sample id=689117. Maximum sequence length: 2049, sample length: 2808 +[default0]:Skipping sample id=29060. Maximum sequence length: 2049, sample length: 2408 +[default0]:Skipping sample id=1192498. Maximum sequence length: 2049, sample length: 2486 +[default0]:Skipping sample id=1551560. Maximum sequence length: 2049, sample length: 2568 +[default0]:Skipping sample id=910942. Maximum sequence length: 2049, sample length: 2360 +[default0]:Skipping sample id=1382801. Maximum sequence length: 2049, sample length: 4362 +[default0]:Skipping sample id=578859. Maximum sequence length: 2049, sample length: 2517 +[default0]:Skipping sample id=1126223. Maximum sequence length: 2049, sample length: 3321 +[default0]:Skipping sample id=519870. Maximum sequence length: 2049, sample length: 2713 +[default0]:Skipping sample id=387927. Maximum sequence length: 2049, sample length: 2063 +[default0]:Skipping sample id=555653. Maximum sequence length: 2049, sample length: 2118 +[default0]:Skipping sample id=110813. Maximum sequence length: 2049, sample length: 4657 +[default0]:Skipping sample id=823476. Maximum sequence length: 2049, sample length: 2774 +[default0]:Skipping sample id=1217303. Maximum sequence length: 2049, sample length: 2836 +[default0]:Skipping sample id=1359287. Maximum sequence length: 2049, sample length: 4795 +[default0]:Skipping sample id=324611. Maximum sequence length: 2049, sample length: 2082 +[default0]:Skipping sample id=907361. Maximum sequence length: 2049, sample length: 2672 +[default0]:Skipping sample id=897209. Maximum sequence length: 2049, sample length: 2700 +[default0]:Skipping sample id=76724. Maximum sequence length: 2049, sample length: 2317 +[default0]:Skipping sample id=987328. Maximum sequence length: 2049, sample length: 3575 +[default0]:Skipping sample id=1452535. Maximum sequence length: 2049, sample length: 2489 +[default0]:Skipping sample id=395726. Maximum sequence length: 2049, sample length: 2619 +[default0]:Skipping sample id=1077272. Maximum sequence length: 2049, sample length: 4342 +[default0]:Skipping sample id=848003. Maximum sequence length: 2049, sample length: 2125 +[default0]:Skipping sample id=652268. Maximum sequence length: 2049, sample length: 2577 +[default0]:Skipping sample id=1344188. Maximum sequence length: 2049, sample length: 4033 +[default0]:Skipping sample id=1512862. Maximum sequence length: 2049, sample length: 3496 +[default0]:Skipping sample id=896875. Maximum sequence length: 2049, sample length: 2556 +[default0]:Skipping sample id=202361. Maximum sequence length: 2049, sample length: 2548 +[default0]:Skipping sample id=1317288. Maximum sequence length: 2049, sample length: 2305 +[default0]:Skipping sample id=1129279. Maximum sequence length: 2049, sample length: 2702 +[default0]:Skipping sample id=181342. Maximum sequence length: 2049, sample length: 3098 +[default0]:Skipping sample id=1437706. Maximum sequence length: 2049, sample length: 2522 +[default0]:Skipping sample id=170720. Maximum sequence length: 2049, sample length: 3455 +[default0]:Skipping sample id=1568823. Maximum sequence length: 2049, sample length: 4577 +[default0]:Skipping sample id=217128. Maximum sequence length: 2049, sample length: 2967 +[default0]:Skipping sample id=773914. Maximum sequence length: 2049, sample length: 2514 +[default0]:Skipping sample id=1352118. Maximum sequence length: 2049, sample length: 2066 +[default0]:Skipping sample id=1237421. Maximum sequence length: 2049, sample length: 3487 +[default0]:Skipping sample id=463306. Maximum sequence length: 2049, sample length: 2867 +[default0]:Skipping sample id=461677. Maximum sequence length: 2049, sample length: 2208 +[default0]:Skipping sample id=38555. Maximum sequence length: 2049, sample length: 3141 +[default0]:Skipping sample id=397904. Maximum sequence length: 2049, sample length: 3729 +[default0]:Skipping sample id=1295117. Maximum sequence length: 2049, sample length: 2445 +[default0]:Skipping sample id=1318257. Maximum sequence length: 2049, sample length: 4023 +[default0]:Skipping sample id=1379073. Maximum sequence length: 2049, sample length: 4563 +[default0]:Skipping sample id=947813. Maximum sequence length: 2049, sample length: 2407 +[default0]:Skipping sample id=1219083. Maximum sequence length: 2049, sample length: 2989 +[default0]:Skipping sample id=1507697. Maximum sequence length: 2049, sample length: 3072 +[default0]:Skipping sample id=640492. Maximum sequence length: 2049, sample length: 2399 +[default0]:Skipping sample id=145307. Maximum sequence length: 2049, sample length: 2486 +[default0]:Skipping sample id=571609. Maximum sequence length: 2049, sample length: 2780 +[default0]:Skipping sample id=1075144. Maximum sequence length: 2049, sample length: 2091 +[default0]:Skipping sample id=23195. Maximum sequence length: 2049, sample length: 2828 +[default0]:Skipping sample id=960670. Maximum sequence length: 2049, sample length: 2101 +[default0]:Skipping sample id=577413. Maximum sequence length: 2049, sample length: 2744 +[default0]:Skipping sample id=590351. Maximum sequence length: 2049, sample length: 3374 +[default0]:Skipping sample id=426580. Maximum sequence length: 2049, sample length: 3551 +[default0]:Skipping sample id=476189. Maximum sequence length: 2049, sample length: 3206 +[default0]:Skipping sample id=922260. Maximum sequence length: 2049, sample length: 3494 +[default0]:Skipping sample id=1461151. Maximum sequence length: 2049, sample length: 3300 +[default0]:Skipping sample id=1396341. Maximum sequence length: 2049, sample length: 2984 +[default0]:Skipping sample id=1097060. Maximum sequence length: 2049, sample length: 2758 +[default0]:Skipping sample id=832255. Maximum sequence length: 2049, sample length: 2178 +[default0]:Skipping sample id=347031. Maximum sequence length: 2049, sample length: 2649 +[default0]:Skipping sample id=845942. Maximum sequence length: 2049, sample length: 2119 +[default0]:Skipping sample id=981373. Maximum sequence length: 2049, sample length: 4048 +[default0]:Skipping sample id=1199832. Maximum sequence length: 2049, sample length: 2484 +[default0]:Skipping sample id=1096918. Maximum sequence length: 2049, sample length: 2452 +[default0]:Skipping sample id=104183. Maximum sequence length: 2049, sample length: 3466 +[default0]:Skipping sample id=1377558. Maximum sequence length: 2049, sample length: 2133 +[default0]:Skipping sample id=1089192. Maximum sequence length: 2049, sample length: 3665 +[default0]:Skipping sample id=1099161. Maximum sequence length: 2049, sample length: 2147 +[default0]:Skipping sample id=1095812. Maximum sequence length: 2049, sample length: 2187 +[default0]:Skipping sample id=226846. Maximum sequence length: 2049, sample length: 2659 +[default0]:Skipping sample id=529755. Maximum sequence length: 2049, sample length: 4719 +[default0]:Skipping sample id=1155297. Maximum sequence length: 2049, sample length: 3315 +[default0]:Skipping sample id=1166878. Maximum sequence length: 2049, sample length: 2721 +[default0]:Skipping sample id=609397. Maximum sequence length: 2049, sample length: 3834 +[default0]:Skipping sample id=234168. Maximum sequence length: 2049, sample length: 2054 +[default0]:Skipping sample id=1561819. Maximum sequence length: 2049, sample length: 2085 +[default0]:Skipping sample id=362532. Maximum sequence length: 2049, sample length: 2234 +[default0]:Skipping sample id=1025859. Maximum sequence length: 2049, sample length: 2621 +[default0]:Skipping sample id=211620. Maximum sequence length: 2049, sample length: 2600 +[default0]:Skipping sample id=1016168. Maximum sequence length: 2049, sample length: 3958 +[default0]:Skipping sample id=281735. Maximum sequence length: 2049, sample length: 2085 +[default0]:Skipping sample id=1516359. Maximum sequence length: 2049, sample length: 2446 +[default0]:Skipping sample id=356104. Maximum sequence length: 2049, sample length: 2900 +[default0]:Skipping sample id=874202. Maximum sequence length: 2049, sample length: 2714 +[default0]:Skipping sample id=249384. Maximum sequence length: 2049, sample length: 2171 +[default0]:Skipping sample id=552098. Maximum sequence length: 2049, sample length: 2055 +[default0]:Skipping sample id=843406. Maximum sequence length: 2049, sample length: 2580 +[default0]:Skipping sample id=260206. Maximum sequence length: 2049, sample length: 2847 +[default0]:Skipping sample id=698765. Maximum sequence length: 2049, sample length: 2768 +[default0]:Skipping sample id=66947. Maximum sequence length: 2049, sample length: 2249 +[default0]:Skipping sample id=957069. Maximum sequence length: 2049, sample length: 2680 +[default0]:Skipping sample id=376515. Maximum sequence length: 2049, sample length: 4283 +[default0]:Skipping sample id=515315. Maximum sequence length: 2049, sample length: 2101 +[default0]:Skipping sample id=282860. Maximum sequence length: 2049, sample length: 4893 +[default0]:Skipping sample id=991819. Maximum sequence length: 2049, sample length: 2592 +[default0]:Skipping sample id=1015349. Maximum sequence length: 2049, sample length: 2465 +[default0]:Skipping sample id=567941. Maximum sequence length: 2049, sample length: 6671 +[default0]:Skipping sample id=985996. Maximum sequence length: 2049, sample length: 3365 +[default0]:Skipping sample id=1362709. Maximum sequence length: 2049, sample length: 2218 +[default0]:Skipping sample id=164153. Maximum sequence length: 2049, sample length: 2712 +[default0]:Skipping sample id=273976. Maximum sequence length: 2049, sample length: 4860 +[default0]:Skipping sample id=825557. Maximum sequence length: 2049, sample length: 2160 +[default0]:Skipping sample id=373558. Maximum sequence length: 2049, sample length: 2412 +[default0]:Skipping sample id=694751. Maximum sequence length: 2049, sample length: 2188 +[default0]:Skipping sample id=263347. Maximum sequence length: 2049, sample length: 4747 +[default0]:Skipping sample id=1305868. Maximum sequence length: 2049, sample length: 2442 +[default0]:Skipping sample id=306530. Maximum sequence length: 2049, sample length: 3008 +[default0]:Skipping sample id=754798. Maximum sequence length: 2049, sample length: 2083 +[default0]:Skipping sample id=373741. Maximum sequence length: 2049, sample length: 3153 +[default0]:Skipping sample id=1548896. Maximum sequence length: 2049, sample length: 2218 +[default0]:Skipping sample id=47094. Maximum sequence length: 2049, sample length: 2988 +[default0]:Skipping sample id=497532. Maximum sequence length: 2049, sample length: 2260 +[default0]:Skipping sample id=353763. Maximum sequence length: 2049, sample length: 2228 +[default0]:Skipping sample id=1192735. Maximum sequence length: 2049, sample length: 2424 +[default0]:Skipping sample id=447675. Maximum sequence length: 2049, sample length: 2121 +[default0]:Skipping sample id=637264. Maximum sequence length: 2049, sample length: 2890 +[default0]:Skipping sample id=349071. Maximum sequence length: 2049, sample length: 3163 +[default0]:Skipping sample id=307306. Maximum sequence length: 2049, sample length: 2537 +[default0]:Skipping sample id=746435. Maximum sequence length: 2049, sample length: 2415 +[default0]:Skipping sample id=1110271. Maximum sequence length: 2049, sample length: 2453 +[default0]:Skipping sample id=1264090. Maximum sequence length: 2049, sample length: 3405 +[default0]:Skipping sample id=1077810. Maximum sequence length: 2049, sample length: 4886 +[default0]:Skipping sample id=439669. Maximum sequence length: 2049, sample length: 2584 +[default0]:Skipping sample id=247350. Maximum sequence length: 2049, sample length: 2085 +[default0]:Skipping sample id=903267. Maximum sequence length: 2049, sample length: 2220 +[default0]:Skipping sample id=711211. Maximum sequence length: 2049, sample length: 2165 +[default0]:Skipping sample id=978048. Maximum sequence length: 2049, sample length: 2138 +[default0]:Skipping sample id=730409. Maximum sequence length: 2049, sample length: 2113 +[default0]:Skipping sample id=296603. Maximum sequence length: 2049, sample length: 3290 +[default0]:Skipping sample id=1180244. Maximum sequence length: 2049, sample length: 3695 +[default0]:Skipping sample id=345049. Maximum sequence length: 2049, sample length: 2508 +[default0]:Skipping sample id=307499. Maximum sequence length: 2049, sample length: 4617 +[default0]:Skipping sample id=345611. Maximum sequence length: 2049, sample length: 2532 +[default0]:Skipping sample id=1038494. Maximum sequence length: 2049, sample length: 2072 +[default0]:Skipping sample id=575982. Maximum sequence length: 2049, sample length: 2925 +[default0]:Skipping sample id=1215778. Maximum sequence length: 2049, sample length: 2068 +[default0]:Skipping sample id=1501424. Maximum sequence length: 2049, sample length: 3120 +[default0]:Skipping sample id=928150. Maximum sequence length: 2049, sample length: 2217 +[default0]:Skipping sample id=1541837. Maximum sequence length: 2049, sample length: 3387 +[default0]:Skipping sample id=936022. Maximum sequence length: 2049, sample length: 2572 +[default0]:Skipping sample id=99507. Maximum sequence length: 2049, sample length: 2376 +[default0]:Skipping sample id=562965. Maximum sequence length: 2049, sample length: 2353 +[default0]:Skipping sample id=419786. Maximum sequence length: 2049, sample length: 2283 +[default0]:Skipping sample id=53712. Maximum sequence length: 2049, sample length: 2282 +[default0]:Skipping sample id=604706. Maximum sequence length: 2049, sample length: 4570 +[default0]:Skipping sample id=591106. Maximum sequence length: 2049, sample length: 2461 +[default0]:Skipping sample id=877975. Maximum sequence length: 2049, sample length: 5428 +[default0]:Skipping sample id=1013470. Maximum sequence length: 2049, sample length: 2567 +[default0]:Skipping sample id=357853. Maximum sequence length: 2049, sample length: 2977 +[default0]:Skipping sample id=1465994. Maximum sequence length: 2049, sample length: 2890 +[default0]:Skipping sample id=801771. Maximum sequence length: 2049, sample length: 3396 +[default0]:Skipping sample id=915187. Maximum sequence length: 2049, sample length: 3288 +[default0]:Skipping sample id=724299. Maximum sequence length: 2049, sample length: 2101 +[default0]:Skipping sample id=1201616. Maximum sequence length: 2049, sample length: 2584 +[default0]:Skipping sample id=244060. Maximum sequence length: 2049, sample length: 2334 +[default0]:Skipping sample id=1422673. Maximum sequence length: 2049, sample length: 3390 +[default0]:Skipping sample id=907335. Maximum sequence length: 2049, sample length: 4643 +[default0]:Skipping sample id=649231. Maximum sequence length: 2049, sample length: 2873 +[default0]:Skipping sample id=471749. Maximum sequence length: 2049, sample length: 2692 +[default0]:Skipping sample id=419382. Maximum sequence length: 2049, sample length: 4762 +[default0]:Skipping sample id=384050. Maximum sequence length: 2049, sample length: 3181 +[default0]:Skipping sample id=811979. Maximum sequence length: 2049, sample length: 2632 +[default0]:Skipping sample id=471271. Maximum sequence length: 2049, sample length: 2465 +[default0]:Skipping sample id=686260. Maximum sequence length: 2049, sample length: 5791 +[default0]:Skipping sample id=57373. Maximum sequence length: 2049, sample length: 4243 +[default0]:Skipping sample id=287788. Maximum sequence length: 2049, sample length: 3516 +[default0]:Skipping sample id=163395. Maximum sequence length: 2049, sample length: 4888 +[default0]:Skipping sample id=1473485. Maximum sequence length: 2049, sample length: 3378 +[default0]:Skipping sample id=855385. Maximum sequence length: 2049, sample length: 2275 +[default0]:Skipping sample id=429284. Maximum sequence length: 2049, sample length: 2891 +[default0]:Skipping sample id=840307. Maximum sequence length: 2049, sample length: 4167 +[default0]:Skipping sample id=119677. Maximum sequence length: 2049, sample length: 2757 +[default0]:Skipping sample id=127216. Maximum sequence length: 2049, sample length: 3019 +[default0]:Skipping sample id=1097459. Maximum sequence length: 2049, sample length: 3643 +[default0]:Skipping sample id=878198. Maximum sequence length: 2049, sample length: 3420 +[default0]:Skipping sample id=1212492. Maximum sequence length: 2049, sample length: 2080 +[default0]:Skipping sample id=375136. Maximum sequence length: 2049, sample length: 2293 +[default0]:Skipping sample id=872553. Maximum sequence length: 2049, sample length: 2063 +[default0]:Skipping sample id=1362847. Maximum sequence length: 2049, sample length: 4133 +[default0]:Skipping sample id=1213719. Maximum sequence length: 2049, sample length: 2269 +[default0]:Skipping sample id=975680. Maximum sequence length: 2049, sample length: 4322 +[default0]:Skipping sample id=714101. Maximum sequence length: 2049, sample length: 2421 +[default0]:Skipping sample id=687575. Maximum sequence length: 2049, sample length: 3463 +[default0]:Skipping sample id=1273461. Maximum sequence length: 2049, sample length: 2166 +[default0]:Skipping sample id=538247. Maximum sequence length: 2049, sample length: 3377 +[default0]:Skipping sample id=1499193. Maximum sequence length: 2049, sample length: 4181 +[default0]:Skipping sample id=830148. Maximum sequence length: 2049, sample length: 3091 +[default0]:Skipping sample id=940523. Maximum sequence length: 2049, sample length: 2950 +[default0]:Skipping sample id=450724. Maximum sequence length: 2049, sample length: 2153 +[default0]:Skipping sample id=1354248. Maximum sequence length: 2049, sample length: 3559 +[default0]:Skipping sample id=69710. Maximum sequence length: 2049, sample length: 2835 +[default0]:Skipping sample id=388033. Maximum sequence length: 2049, sample length: 2687 +[default0]:Skipping sample id=816873. Maximum sequence length: 2049, sample length: 2644 +[default0]:Skipping sample id=302949. Maximum sequence length: 2049, sample length: 2191 +[default0]:Skipping sample id=678482. Maximum sequence length: 2049, sample length: 2683 +[default0]:Skipping sample id=340790. Maximum sequence length: 2049, sample length: 4012 +[default0]:Skipping sample id=725922. Maximum sequence length: 2049, sample length: 2071 +[default0]:Skipping sample id=30986. Maximum sequence length: 2049, sample length: 4105 +[default0]:Skipping sample id=421542. Maximum sequence length: 2049, sample length: 3897 +[default0]:Skipping sample id=119850. Maximum sequence length: 2049, sample length: 5028 +[default0]:Skipping sample id=1235797. Maximum sequence length: 2049, sample length: 3904 +[default0]:Skipping sample id=482422. Maximum sequence length: 2049, sample length: 2816 +[default0]:Skipping sample id=1163447. Maximum sequence length: 2049, sample length: 4231 +[default0]:Skipping sample id=210115. Maximum sequence length: 2049, sample length: 3107 +[default0]:Skipping sample id=1397119. Maximum sequence length: 2049, sample length: 4120 +[default0]:Skipping sample id=856792. Maximum sequence length: 2049, sample length: 2664 +[default0]:Skipping sample id=967498. Maximum sequence length: 2049, sample length: 2410 +[default0]:Skipping sample id=440569. Maximum sequence length: 2049, sample length: 2396 +[default0]:Skipping sample id=358012. Maximum sequence length: 2049, sample length: 2147 +[default0]:Skipping sample id=740858. Maximum sequence length: 2049, sample length: 2755 +[default0]:Skipping sample id=426715. Maximum sequence length: 2049, sample length: 2053 +[default0]:Skipping sample id=899088. Maximum sequence length: 2049, sample length: 5070 +[default0]:Skipping sample id=466037. Maximum sequence length: 2049, sample length: 2171 +[default0]:Skipping sample id=177448. Maximum sequence length: 2049, sample length: 2598 +[default0]:Skipping sample id=109846. Maximum sequence length: 2049, sample length: 2412 +[default0]:Skipping sample id=1558077. Maximum sequence length: 2049, sample length: 3410 +[default0]:Skipping sample id=602038. Maximum sequence length: 2049, sample length: 4161 +[default0]:Skipping sample id=743765. Maximum sequence length: 2049, sample length: 2243 +[default0]:Skipping sample id=251655. Maximum sequence length: 2049, sample length: 3721 +[default0]:Skipping sample id=634662. Maximum sequence length: 2049, sample length: 5315 +[default0]:Skipping sample id=92718. Maximum sequence length: 2049, sample length: 2357 +[default0]:Skipping sample id=125710. Maximum sequence length: 2049, sample length: 2918 +[default0]:Skipping sample id=350557. Maximum sequence length: 2049, sample length: 3321 +[default0]:Skipping sample id=1279682. Maximum sequence length: 2049, sample length: 6769 +[default0]:Skipping sample id=697931. Maximum sequence length: 2049, sample length: 2662 +[default0]:Skipping sample id=1274637. Maximum sequence length: 2049, sample length: 2168 +[default0]:Skipping sample id=483187. Maximum sequence length: 2049, sample length: 2736 +[default0]:Skipping sample id=990394. Maximum sequence length: 2049, sample length: 3352 +[default0]:Skipping sample id=212739. Maximum sequence length: 2049, sample length: 2633 +[default0]:Skipping sample id=82290. Maximum sequence length: 2049, sample length: 2367 +[default0]:Skipping sample id=121822. Maximum sequence length: 2049, sample length: 3015 +[default0]:Skipping sample id=771305. Maximum sequence length: 2049, sample length: 2263 +[default0]:Skipping sample id=758731. Maximum sequence length: 2049, sample length: 2275 +[default0]:Skipping sample id=1089917. Maximum sequence length: 2049, sample length: 2204 +[default0]:Skipping sample id=74907. Maximum sequence length: 2049, sample length: 2287 +[default0]:Skipping sample id=188829. Maximum sequence length: 2049, sample length: 3112 +[default0]:Skipping sample id=178384. Maximum sequence length: 2049, sample length: 4602 +[default0]:Skipping sample id=95922. Maximum sequence length: 2049, sample length: 3705 +[default0]:Skipping sample id=1526090. Maximum sequence length: 2049, sample length: 2723 +[default0]:Skipping sample id=820295. Maximum sequence length: 2049, sample length: 2539 +[default0]:Skipping sample id=328182. Maximum sequence length: 2049, sample length: 2560 +[default0]:Skipping sample id=893396. Maximum sequence length: 2049, sample length: 2815 +[default0]:Skipping sample id=607300. Maximum sequence length: 2049, sample length: 4604 +[default0]:Skipping sample id=852284. Maximum sequence length: 2049, sample length: 2062 +[default0]:Skipping sample id=837309. Maximum sequence length: 2049, sample length: 2069 +[default0]:Skipping sample id=1555619. Maximum sequence length: 2049, sample length: 3324 +[default0]:Skipping sample id=111203. Maximum sequence length: 2049, sample length: 2292 +[default0]:Skipping sample id=606499. Maximum sequence length: 2049, sample length: 5656 +[default0]:Skipping sample id=215715. Maximum sequence length: 2049, sample length: 3372 +[default0]:Skipping sample id=133632. Maximum sequence length: 2049, sample length: 3128 +[default0]:Skipping sample id=362944. Maximum sequence length: 2049, sample length: 3935 +[default0]:Skipping sample id=654985. Maximum sequence length: 2049, sample length: 3078 +[default0]:Skipping sample id=618084. Maximum sequence length: 2049, sample length: 2496 +[default0]:Skipping sample id=1314456. Maximum sequence length: 2049, sample length: 6672 +[default0]:Skipping sample id=313015. Maximum sequence length: 2049, sample length: 2607 +[default0]:Skipping sample id=1445964. Maximum sequence length: 2049, sample length: 2255 +[default0]:Skipping sample id=1080546. Maximum sequence length: 2049, sample length: 5134 +[default0]:Skipping sample id=152812. Maximum sequence length: 2049, sample length: 2848 +[default0]:Skipping sample id=1384939. Maximum sequence length: 2049, sample length: 3069 +[default0]:Skipping sample id=19364. Maximum sequence length: 2049, sample length: 3493 +[default0]:Skipping sample id=1061592. Maximum sequence length: 2049, sample length: 4698 +[default0]:Skipping sample id=1232700. Maximum sequence length: 2049, sample length: 2808 +[default0]:Skipping sample id=288663. Maximum sequence length: 2049, sample length: 2322 +[default0]:Skipping sample id=350211. Maximum sequence length: 2049, sample length: 2502 +[default0]:Skipping sample id=74098. Maximum sequence length: 2049, sample length: 3888 +[default0]:Skipping sample id=797180. Maximum sequence length: 2049, sample length: 3185 +[default0]:Skipping sample id=81667. Maximum sequence length: 2049, sample length: 2485 +[default0]:Skipping sample id=689127. Maximum sequence length: 2049, sample length: 6339 +[default0]:Skipping sample id=703367. Maximum sequence length: 2049, sample length: 2268 +[default0]:Skipping sample id=75378. Maximum sequence length: 2049, sample length: 2895 +[default0]:Skipping sample id=535360. Maximum sequence length: 2049, sample length: 2320 +[default0]:Skipping sample id=670504. Maximum sequence length: 2049, sample length: 2728 +[default0]:Skipping sample id=246930. Maximum sequence length: 2049, sample length: 2316 +[default0]:Skipping sample id=584348. Maximum sequence length: 2049, sample length: 2295 +[default0]:Skipping sample id=837208. Maximum sequence length: 2049, sample length: 2233 +[default0]:Skipping sample id=1055446. Maximum sequence length: 2049, sample length: 2728 +[default0]:Skipping sample id=648414. Maximum sequence length: 2049, sample length: 2761 +[default0]:Skipping sample id=371217. Maximum sequence length: 2049, sample length: 2153 +[default0]:Skipping sample id=151869. Maximum sequence length: 2049, sample length: 2430 +[default0]:Skipping sample id=430682. Maximum sequence length: 2049, sample length: 5876 +[default0]:Skipping sample id=1289488. Maximum sequence length: 2049, sample length: 3080 +[default0]:Skipping sample id=463230. Maximum sequence length: 2049, sample length: 2648 +[default0]:Skipping sample id=221548. Maximum sequence length: 2049, sample length: 2114 +[default0]:Skipping sample id=778942. Maximum sequence length: 2049, sample length: 2175 +[default0]:Skipping sample id=52322. Maximum sequence length: 2049, sample length: 2717 +[default0]:Skipping sample id=1082359. Maximum sequence length: 2049, sample length: 2138 +[default0]:Skipping sample id=773387. Maximum sequence length: 2049, sample length: 3156 +[default0]:Skipping sample id=1517022. Maximum sequence length: 2049, sample length: 3479 +[default0]:Skipping sample id=1435523. Maximum sequence length: 2049, sample length: 6325 +[default0]:Skipping sample id=735285. Maximum sequence length: 2049, sample length: 2961 +[default0]:Skipping sample id=1006288. Maximum sequence length: 2049, sample length: 2274 +[default0]:Skipping sample id=190529. Maximum sequence length: 2049, sample length: 2430 +[default0]:Skipping sample id=788420. Maximum sequence length: 2049, sample length: 2498 +[default0]:Skipping sample id=626516. Maximum sequence length: 2049, sample length: 2059 +[default0]:Skipping sample id=670836. Maximum sequence length: 2049, sample length: 2821 +[default0]:Skipping sample id=404959. Maximum sequence length: 2049, sample length: 2234 +[default0]:Skipping sample id=1432553. Maximum sequence length: 2049, sample length: 2895 +[default0]:Skipping sample id=537454. Maximum sequence length: 2049, sample length: 2812 +[default0]:Skipping sample id=1227432. Maximum sequence length: 2049, sample length: 2524 +[default0]:Skipping sample id=73888. Maximum sequence length: 2049, sample length: 3361 +[default0]:Skipping sample id=351800. Maximum sequence length: 2049, sample length: 3613 +[default0]:Skipping sample id=1028919. Maximum sequence length: 2049, sample length: 2749 +[default0]:Skipping sample id=336212. Maximum sequence length: 2049, sample length: 4374 +[default0]:Skipping sample id=833593. Maximum sequence length: 2049, sample length: 2164 +[default0]:Skipping sample id=1263544. Maximum sequence length: 2049, sample length: 4257 +[default0]:Skipping sample id=1204032. Maximum sequence length: 2049, sample length: 3035 +[default0]:Skipping sample id=8362. Maximum sequence length: 2049, sample length: 2096 +[default0]:Skipping sample id=851387. Maximum sequence length: 2049, sample length: 2070 +[default0]:Skipping sample id=1301359. Maximum sequence length: 2049, sample length: 2135 +[default0]:Skipping sample id=38681. Maximum sequence length: 2049, sample length: 2282 +[default0]:Skipping sample id=1062288. Maximum sequence length: 2049, sample length: 2164 +[default0]:Skipping sample id=1396924. Maximum sequence length: 2049, sample length: 2056 +[default0]:Skipping sample id=678621. Maximum sequence length: 2049, sample length: 2542 +[default0]:Skipping sample id=1235030. Maximum sequence length: 2049, sample length: 3200 +[default0]:Skipping sample id=1319794. Maximum sequence length: 2049, sample length: 4090 +[default0]:Skipping sample id=422353. Maximum sequence length: 2049, sample length: 2583 +[default0]:Skipping sample id=391049. Maximum sequence length: 2049, sample length: 2060 +[default0]:Skipping sample id=1397159. Maximum sequence length: 2049, sample length: 2664 +[default0]:Skipping sample id=185094. Maximum sequence length: 2049, sample length: 2960 +[default0]:Skipping sample id=630828. Maximum sequence length: 2049, sample length: 2290 +[default0]:Skipping sample id=374270. Maximum sequence length: 2049, sample length: 2653 +[default0]:Skipping sample id=1381999. Maximum sequence length: 2049, sample length: 4200 +[default0]:Skipping sample id=562473. Maximum sequence length: 2049, sample length: 3352 +[default0]:Skipping sample id=251372. Maximum sequence length: 2049, sample length: 2093 +[default0]:Skipping sample id=134425. Maximum sequence length: 2049, sample length: 4603 +[default0]:Skipping sample id=1567303. Maximum sequence length: 2049, sample length: 2834 +[default0]:Skipping sample id=906733. Maximum sequence length: 2049, sample length: 2415 +[default0]:Skipping sample id=837871. Maximum sequence length: 2049, sample length: 3866 +[default0]:Skipping sample id=1532779. Maximum sequence length: 2049, sample length: 2474 +[default0]:Skipping sample id=1492728. Maximum sequence length: 2049, sample length: 2054 +[default0]:Skipping sample id=994765. Maximum sequence length: 2049, sample length: 3829 +[default0]:Skipping sample id=1011523. Maximum sequence length: 2049, sample length: 2759 +[default0]:Skipping sample id=55586. Maximum sequence length: 2049, sample length: 2958 +[default0]:Skipping sample id=977528. Maximum sequence length: 2049, sample length: 2608 +[default0]:Skipping sample id=234612. Maximum sequence length: 2049, sample length: 2264 +[default0]:Skipping sample id=1314462. Maximum sequence length: 2049, sample length: 2737 +[default0]:Skipping sample id=187438. Maximum sequence length: 2049, sample length: 2730 +[default0]:Skipping sample id=716537. Maximum sequence length: 2049, sample length: 2408 +[default0]:Skipping sample id=266791. Maximum sequence length: 2049, sample length: 2224 +[default0]:Skipping sample id=1397878. Maximum sequence length: 2049, sample length: 2162 +[default0]:Skipping sample id=1501401. Maximum sequence length: 2049, sample length: 2411 +[default0]:Skipping sample id=809143. Maximum sequence length: 2049, sample length: 2524 +[default0]:Skipping sample id=1403683. Maximum sequence length: 2049, sample length: 2151 +[default0]:Skipping sample id=165080. Maximum sequence length: 2049, sample length: 3699 +[default0]:Skipping sample id=835385. Maximum sequence length: 2049, sample length: 2543 +[default0]:Skipping sample id=1120960. Maximum sequence length: 2049, sample length: 2694 +[default0]:Skipping sample id=1174145. Maximum sequence length: 2049, sample length: 3637 +[default0]:Skipping sample id=685533. Maximum sequence length: 2049, sample length: 2439 +[default0]:Skipping sample id=1420141. Maximum sequence length: 2049, sample length: 2106 +[default0]:Skipping sample id=1085639. Maximum sequence length: 2049, sample length: 3440 +[default0]:Skipping sample id=1066376. Maximum sequence length: 2049, sample length: 3386 +[default0]:Skipping sample id=674359. Maximum sequence length: 2049, sample length: 2723 +[default0]:Skipping sample id=995483. Maximum sequence length: 2049, sample length: 2086 +[default0]:Skipping sample id=481877. Maximum sequence length: 2049, sample length: 2702 +[default0]:Skipping sample id=175429. Maximum sequence length: 2049, sample length: 2447 +[default0]:Skipping sample id=195201. Maximum sequence length: 2049, sample length: 2072 +[default0]:Skipping sample id=725172. Maximum sequence length: 2049, sample length: 2248 +[default0]:Skipping sample id=1485865. Maximum sequence length: 2049, sample length: 2654 +[default0]:Skipping sample id=1298633. Maximum sequence length: 2049, sample length: 2052 +[default0]:Skipping sample id=14226. Maximum sequence length: 2049, sample length: 2959 +[default0]:Skipping sample id=889237. Maximum sequence length: 2049, sample length: 2874 +[default0]:Skipping sample id=851721. Maximum sequence length: 2049, sample length: 2651 +[default0]:Skipping sample id=424710. Maximum sequence length: 2049, sample length: 2531 +[default0]:Skipping sample id=1080183. Maximum sequence length: 2049, sample length: 2321 +[default0]:Skipping sample id=100457. Maximum sequence length: 2049, sample length: 2602 +[default0]:Skipping sample id=1388694. Maximum sequence length: 2049, sample length: 3053 +[default0]:Skipping sample id=1393733. Maximum sequence length: 2049, sample length: 2895 +[default0]:Skipping sample id=1558434. Maximum sequence length: 2049, sample length: 2559 +[default0]:Skipping sample id=1401123. Maximum sequence length: 2049, sample length: 2768 +[default0]:Skipping sample id=781655. Maximum sequence length: 2049, sample length: 4073 +[default0]:Skipping sample id=539995. Maximum sequence length: 2049, sample length: 2386 +[default0]:Skipping sample id=261449. Maximum sequence length: 2049, sample length: 2214 +[default0]:Skipping sample id=34294. Maximum sequence length: 2049, sample length: 6443 +[default0]:Skipping sample id=546604. Maximum sequence length: 2049, sample length: 2630 +[default0]:Skipping sample id=884650. Maximum sequence length: 2049, sample length: 3047 +[default0]:Skipping sample id=1345585. Maximum sequence length: 2049, sample length: 2849 +[default0]:Skipping sample id=339987. Maximum sequence length: 2049, sample length: 2267 +[default0]:Skipping sample id=1312704. Maximum sequence length: 2049, sample length: 2076 +[default0]:Skipping sample id=760549. Maximum sequence length: 2049, sample length: 4000 +[default0]:Skipping sample id=989631. Maximum sequence length: 2049, sample length: 2073 +[default0]:Skipping sample id=330994. Maximum sequence length: 2049, sample length: 2116 +[default0]:Skipping sample id=1022983. Maximum sequence length: 2049, sample length: 2310 +[default0]:Skipping sample id=411495. Maximum sequence length: 2049, sample length: 2691 +[default0]:Skipping sample id=1265712. Maximum sequence length: 2049, sample length: 5731 +[default0]:Skipping sample id=465348. Maximum sequence length: 2049, sample length: 2338 +[default0]:Skipping sample id=1421479. Maximum sequence length: 2049, sample length: 4084 +[default0]:Skipping sample id=1340045. Maximum sequence length: 2049, sample length: 3130 +[default0]:Skipping sample id=827368. Maximum sequence length: 2049, sample length: 2479 +[default0]:Skipping sample id=1281922. Maximum sequence length: 2049, sample length: 2876 +[default0]:Skipping sample id=816901. Maximum sequence length: 2049, sample length: 2270 +[default0]:Skipping sample id=1271297. Maximum sequence length: 2049, sample length: 2375 +[default0]:Skipping sample id=1564988. Maximum sequence length: 2049, sample length: 2544 +[default0]:Skipping sample id=342112. Maximum sequence length: 2049, sample length: 2464 +[default0]:Skipping sample id=1287390. Maximum sequence length: 2049, sample length: 3378 +[default0]:Skipping sample id=458882. Maximum sequence length: 2049, sample length: 2189 +[default0]:Skipping sample id=955784. Maximum sequence length: 2049, sample length: 4756 +[default0]:Skipping sample id=1266072. Maximum sequence length: 2049, sample length: 2252 +[default0]:Skipping sample id=953120. Maximum sequence length: 2049, sample length: 2674 +[default0]:Skipping sample id=860629. Maximum sequence length: 2049, sample length: 7506 +[default0]:Skipping sample id=1187385. Maximum sequence length: 2049, sample length: 2858 +[default0]:Skipping sample id=1097968. Maximum sequence length: 2049, sample length: 2246 +[default0]:Skipping sample id=303634. Maximum sequence length: 2049, sample length: 2148 +[default0]:Skipping sample id=303645. Maximum sequence length: 2049, sample length: 2484 +[default0]:Skipping sample id=532265. Maximum sequence length: 2049, sample length: 2633 +[default0]:Skipping sample id=760118. Maximum sequence length: 2049, sample length: 3340 +[default0]:Skipping sample id=1051544. Maximum sequence length: 2049, sample length: 3431 +[default0]:Skipping sample id=34719. Maximum sequence length: 2049, sample length: 4053 +[default0]:Skipping sample id=251808. Maximum sequence length: 2049, sample length: 2183 +[default0]:Skipping sample id=801103. Maximum sequence length: 2049, sample length: 5619 +[default0]:Skipping sample id=1368061. Maximum sequence length: 2049, sample length: 3797 +[default0]:Skipping sample id=1264739. Maximum sequence length: 2049, sample length: 2509 +[default0]:Skipping sample id=1164637. Maximum sequence length: 2049, sample length: 6961 +[default0]:Skipping sample id=1053284. Maximum sequence length: 2049, sample length: 3837 +[default0]:Skipping sample id=666434. Maximum sequence length: 2049, sample length: 4488 +[default0]:Skipping sample id=365741. Maximum sequence length: 2049, sample length: 2353 +[default0]:Skipping sample id=1232760. Maximum sequence length: 2049, sample length: 3979 +[default0]:Skipping sample id=1461653. Maximum sequence length: 2049, sample length: 2189 +[default0]:Skipping sample id=1375854. Maximum sequence length: 2049, sample length: 2535 +[default0]:Skipping sample id=918626. Maximum sequence length: 2049, sample length: 3687 +[default0]:Skipping sample id=1320559. Maximum sequence length: 2049, sample length: 2406 +[default0]:Skipping sample id=10326. Maximum sequence length: 2049, sample length: 2478 +[default0]:Skipping sample id=1055193. Maximum sequence length: 2049, sample length: 2495 +[default0]:Skipping sample id=970820. Maximum sequence length: 2049, sample length: 2395 +[default0]:Skipping sample id=1088873. Maximum sequence length: 2049, sample length: 5395 +[default0]:Skipping sample id=1046778. Maximum sequence length: 2049, sample length: 3383 +[default0]:Skipping sample id=872290. Maximum sequence length: 2049, sample length: 2098 +[default0]:Skipping sample id=989444. Maximum sequence length: 2049, sample length: 2924 +[default0]:Skipping sample id=452585. Maximum sequence length: 2049, sample length: 3554 +[default0]:Skipping sample id=25718. Maximum sequence length: 2049, sample length: 3360 +[default0]:Skipping sample id=725776. Maximum sequence length: 2049, sample length: 2606 +[default0]:Skipping sample id=845329. Maximum sequence length: 2049, sample length: 3880 +[default0]:Skipping sample id=88432. Maximum sequence length: 2049, sample length: 2402 +[default0]:Skipping sample id=178044. Maximum sequence length: 2049, sample length: 3407 +[default0]:Skipping sample id=975488. Maximum sequence length: 2049, sample length: 2304 +[default0]:Skipping sample id=1341554. Maximum sequence length: 2049, sample length: 2626 +[default0]:Skipping sample id=813028. Maximum sequence length: 2049, sample length: 3746 +[default0]:Skipping sample id=812436. Maximum sequence length: 2049, sample length: 4509 +[default0]:Skipping sample id=1518157. Maximum sequence length: 2049, sample length: 2176 +[default0]:Skipping sample id=390974. Maximum sequence length: 2049, sample length: 3463 +[default0]:Skipping sample id=1460202. Maximum sequence length: 2049, sample length: 2285 +[default0]:Skipping sample id=28908. Maximum sequence length: 2049, sample length: 3062 +[default0]:Skipping sample id=1257662. Maximum sequence length: 2049, sample length: 2101 +[default0]:Skipping sample id=1493605. Maximum sequence length: 2049, sample length: 4365 +[default0]:Skipping sample id=1469590. Maximum sequence length: 2049, sample length: 2416 +[default0]:Skipping sample id=264562. Maximum sequence length: 2049, sample length: 3672 +[default0]:Skipping sample id=267949. Maximum sequence length: 2049, sample length: 2504 +[default0]:Skipping sample id=487620. Maximum sequence length: 2049, sample length: 3209 +[default0]:Skipping sample id=808929. Maximum sequence length: 2049, sample length: 3460 +[default0]:Skipping sample id=88804. Maximum sequence length: 2049, sample length: 2306 +[default0]:Skipping sample id=1407715. Maximum sequence length: 2049, sample length: 2268 +[default0]:Skipping sample id=1511636. Maximum sequence length: 2049, sample length: 2813 +[default0]:Skipping sample id=97965. Maximum sequence length: 2049, sample length: 2623 +[default0]:Skipping sample id=656051. Maximum sequence length: 2049, sample length: 2119 +[default0]:Skipping sample id=66339. Maximum sequence length: 2049, sample length: 3887 +[default0]:Skipping sample id=159959. Maximum sequence length: 2049, sample length: 3814 +[default0]:Skipping sample id=82622. Maximum sequence length: 2049, sample length: 2338 +[default0]:Skipping sample id=1047126. Maximum sequence length: 2049, sample length: 2088 +[default0]:Skipping sample id=168053. Maximum sequence length: 2049, sample length: 2494 +[default0]:Skipping sample id=285795. Maximum sequence length: 2049, sample length: 2910 +[default0]:Skipping sample id=854551. Maximum sequence length: 2049, sample length: 2069 +[default0]:Skipping sample id=47992. Maximum sequence length: 2049, sample length: 3552 +[default0]:Skipping sample id=419169. Maximum sequence length: 2049, sample length: 5177 +[default0]:Skipping sample id=620835. Maximum sequence length: 2049, sample length: 3060 +[default0]:Skipping sample id=1554267. Maximum sequence length: 2049, sample length: 4000 +[default0]:Skipping sample id=1137588. Maximum sequence length: 2049, sample length: 2111 +[default0]:Skipping sample id=1397047. Maximum sequence length: 2049, sample length: 2800 +[default0]:Skipping sample id=850821. Maximum sequence length: 2049, sample length: 2323 +[default0]:Skipping sample id=1226071. Maximum sequence length: 2049, sample length: 3960 +[default0]:Skipping sample id=526311. Maximum sequence length: 2049, sample length: 2724 +[default0]:Skipping sample id=1445962. Maximum sequence length: 2049, sample length: 2352 +[default0]:Skipping sample id=25903. Maximum sequence length: 2049, sample length: 2245 +[default0]:Skipping sample id=434331. Maximum sequence length: 2049, sample length: 3106 +[default0]:Skipping sample id=830272. Maximum sequence length: 2049, sample length: 2368 +[default0]:Skipping sample id=395665. Maximum sequence length: 2049, sample length: 2461 +[default0]:Skipping sample id=911412. Maximum sequence length: 2049, sample length: 2890 +[default0]:Skipping sample id=314427. Maximum sequence length: 2049, sample length: 6658 +[default0]:Skipping sample id=1182550. Maximum sequence length: 2049, sample length: 3381 +[default0]:Skipping sample id=536589. Maximum sequence length: 2049, sample length: 2333 +[default0]:Skipping sample id=731492. Maximum sequence length: 2049, sample length: 3706 +[default0]:Skipping sample id=1354022. Maximum sequence length: 2049, sample length: 4605 +[default0]:Skipping sample id=1267166. Maximum sequence length: 2049, sample length: 4909 +[default0]:Skipping sample id=723652. Maximum sequence length: 2049, sample length: 2490 +[default0]:Skipping sample id=567984. Maximum sequence length: 2049, sample length: 2731 +[default0]:Skipping sample id=1279508. Maximum sequence length: 2049, sample length: 2357 +[default0]:Skipping sample id=1502737. Maximum sequence length: 2049, sample length: 2896 +[default0]:Skipping sample id=1013383. Maximum sequence length: 2049, sample length: 2129 +[default0]:Skipping sample id=25264. Maximum sequence length: 2049, sample length: 2748 +[default0]:Skipping sample id=1428631. Maximum sequence length: 2049, sample length: 2224 +[default0]:Skipping sample id=795377. Maximum sequence length: 2049, sample length: 3145 +[default0]:Skipping sample id=339330. Maximum sequence length: 2049, sample length: 2631 +[default0]:Skipping sample id=345138. Maximum sequence length: 2049, sample length: 2382 +[default0]:Skipping sample id=988870. Maximum sequence length: 2049, sample length: 2621 +[default0]:Skipping sample id=1345817. Maximum sequence length: 2049, sample length: 3055 +[default0]:Skipping sample id=1428096. Maximum sequence length: 2049, sample length: 2821 +[default0]:Skipping sample id=225439. Maximum sequence length: 2049, sample length: 2743 +[default0]:Skipping sample id=1502451. Maximum sequence length: 2049, sample length: 3369 +[default0]:Skipping sample id=962667. Maximum sequence length: 2049, sample length: 2580 +[default0]:Skipping sample id=417861. Maximum sequence length: 2049, sample length: 2077 +[default0]:Skipping sample id=1559753. Maximum sequence length: 2049, sample length: 3067 +[default0]:Skipping sample id=203091. Maximum sequence length: 2049, sample length: 4361 +[default0]:Skipping sample id=1277034. Maximum sequence length: 2049, sample length: 2702 +[default0]:Skipping sample id=813195. Maximum sequence length: 2049, sample length: 3835 +[default0]:Skipping sample id=407005. Maximum sequence length: 2049, sample length: 2860 +[default0]:Skipping sample id=1041731. Maximum sequence length: 2049, sample length: 2103 +[default0]:Skipping sample id=556174. Maximum sequence length: 2049, sample length: 3274 +[default0]:Skipping sample id=1282530. Maximum sequence length: 2049, sample length: 2659 +[default0]:Skipping sample id=769863. Maximum sequence length: 2049, sample length: 3324 +[default0]:Skipping sample id=828926. Maximum sequence length: 2049, sample length: 2194 +[default0]:Skipping sample id=442543. Maximum sequence length: 2049, sample length: 2362 +[default0]:Skipping sample id=1415518. Maximum sequence length: 2049, sample length: 2913 +[default0]:Skipping sample id=902751. Maximum sequence length: 2049, sample length: 2395 +[default0]:Skipping sample id=1313409. Maximum sequence length: 2049, sample length: 3554 +[default0]:Skipping sample id=175007. Maximum sequence length: 2049, sample length: 3080 +[default0]:Skipping sample id=529195. Maximum sequence length: 2049, sample length: 3097 +[default0]:Skipping sample id=1322087. Maximum sequence length: 2049, sample length: 3645 +[default0]:Skipping sample id=565326. Maximum sequence length: 2049, sample length: 2745 +[default0]:Skipping sample id=1124332. Maximum sequence length: 2049, sample length: 2400 +[default0]:Skipping sample id=1414214. Maximum sequence length: 2049, sample length: 4135 +[default0]:Skipping sample id=290851. Maximum sequence length: 2049, sample length: 2663 +[default0]:Skipping sample id=514796. Maximum sequence length: 2049, sample length: 2126 +[default0]:Skipping sample id=1549753. Maximum sequence length: 2049, sample length: 2692 +[default0]:Skipping sample id=116188. Maximum sequence length: 2049, sample length: 4401 +[default0]:Skipping sample id=884043. Maximum sequence length: 2049, sample length: 2833 +[default0]:Skipping sample id=85743. Maximum sequence length: 2049, sample length: 2970 +[default0]:Skipping sample id=1445275. Maximum sequence length: 2049, sample length: 2152 +[default0]:Skipping sample id=90782. Maximum sequence length: 2049, sample length: 4450 +[default0]:Skipping sample id=1357714. Maximum sequence length: 2049, sample length: 4696 +[default0]:Skipping sample id=987863. Maximum sequence length: 2049, sample length: 2836 +[default0]:Skipping sample id=37075. Maximum sequence length: 2049, sample length: 4122 +[default0]:Skipping sample id=246056. Maximum sequence length: 2049, sample length: 2725 +[default0]:Skipping sample id=462879. Maximum sequence length: 2049, sample length: 3182 +[default0]:Skipping sample id=788966. Maximum sequence length: 2049, sample length: 2240 +[default0]:Skipping sample id=202701. Maximum sequence length: 2049, sample length: 2994 +[default0]:Skipping sample id=1073115. Maximum sequence length: 2049, sample length: 5432 +[default0]:Skipping sample id=835121. Maximum sequence length: 2049, sample length: 2420 +[default0]:Skipping sample id=1243542. Maximum sequence length: 2049, sample length: 2980 +[default0]:Skipping sample id=341973. Maximum sequence length: 2049, sample length: 2107 +[default0]:Skipping sample id=402291. Maximum sequence length: 2049, sample length: 2749 +[default0]:Skipping sample id=1380365. Maximum sequence length: 2049, sample length: 2928 +[default0]:Skipping sample id=912223. Maximum sequence length: 2049, sample length: 4224 +[default0]:Skipping sample id=999940. Maximum sequence length: 2049, sample length: 2371 +[default0]:Skipping sample id=326983. Maximum sequence length: 2049, sample length: 2342 +[default0]:Skipping sample id=1094987. Maximum sequence length: 2049, sample length: 2553 +[default0]:Skipping sample id=840883. Maximum sequence length: 2049, sample length: 2509 +[default0]:Skipping sample id=1395992. Maximum sequence length: 2049, sample length: 3152 +[default0]:Skipping sample id=290703. Maximum sequence length: 2049, sample length: 2459 +[default0]:Skipping sample id=88172. Maximum sequence length: 2049, sample length: 2590 +[default0]:Skipping sample id=404649. Maximum sequence length: 2049, sample length: 2520 +[default0]:Skipping sample id=313178. Maximum sequence length: 2049, sample length: 2344 +[default0]:Skipping sample id=160588. Maximum sequence length: 2049, sample length: 2801 +[default0]:Skipping sample id=557924. Maximum sequence length: 2049, sample length: 5345 +[default0]:Skipping sample id=795908. Maximum sequence length: 2049, sample length: 2745 +[default0]:Skipping sample id=893722. Maximum sequence length: 2049, sample length: 2707 +[default0]:Skipping sample id=1269367. Maximum sequence length: 2049, sample length: 3096 +[default0]:Skipping sample id=784975. Maximum sequence length: 2049, sample length: 3716 +[default0]:Skipping sample id=946416. Maximum sequence length: 2049, sample length: 2836 +[default0]:Skipping sample id=1146975. Maximum sequence length: 2049, sample length: 2078 +[default0]:Skipping sample id=1089567. Maximum sequence length: 2049, sample length: 2514 +[default0]:Skipping sample id=420796. Maximum sequence length: 2049, sample length: 3874 +[default0]:Skipping sample id=700921. Maximum sequence length: 2049, sample length: 2430 +[default0]:Skipping sample id=1019200. Maximum sequence length: 2049, sample length: 2087 +[default0]:Skipping sample id=866804. Maximum sequence length: 2049, sample length: 4751 +[default0]:Skipping sample id=647337. Maximum sequence length: 2049, sample length: 2125 +[default0]:Skipping sample id=1189296. Maximum sequence length: 2049, sample length: 2731 +[default0]:Skipping sample id=75110. Maximum sequence length: 2049, sample length: 2081 +[default0]:Skipping sample id=987292. Maximum sequence length: 2049, sample length: 3649 +[default0]:Skipping sample id=1535362. Maximum sequence length: 2049, sample length: 6704 +[default0]:Skipping sample id=554821. Maximum sequence length: 2049, sample length: 2805 +[default0]:Skipping sample id=613098. Maximum sequence length: 2049, sample length: 2655 +[default0]:Skipping sample id=973985. Maximum sequence length: 2049, sample length: 2058 +[default0]:Skipping sample id=1151920. Maximum sequence length: 2049, sample length: 2528 +[default0]:Skipping sample id=1282017. Maximum sequence length: 2049, sample length: 4248 +[default0]:Skipping sample id=1113834. Maximum sequence length: 2049, sample length: 2065 +[default0]:Skipping sample id=1256964. Maximum sequence length: 2049, sample length: 3239 +[default0]:Skipping sample id=1529880. Maximum sequence length: 2049, sample length: 2432 +[default0]:Skipping sample id=293643. Maximum sequence length: 2049, sample length: 3369 +[default0]:Skipping sample id=1509612. Maximum sequence length: 2049, sample length: 2823 +[default0]:Skipping sample id=327012. Maximum sequence length: 2049, sample length: 2755 +[default0]:Skipping sample id=1142722. Maximum sequence length: 2049, sample length: 2611 +[default0]:Skipping sample id=276058. Maximum sequence length: 2049, sample length: 3214 +[default0]:Skipping sample id=998419. Maximum sequence length: 2049, sample length: 2052 +[default0]:Skipping sample id=989889. Maximum sequence length: 2049, sample length: 2631 +[default0]:Skipping sample id=1262048. Maximum sequence length: 2049, sample length: 2678 +[default0]:Skipping sample id=1044225. Maximum sequence length: 2049, sample length: 4479 +[default0]:Skipping sample id=615053. Maximum sequence length: 2049, sample length: 2305 +[default0]:Skipping sample id=315263. Maximum sequence length: 2049, sample length: 2870 +[default0]:Skipping sample id=1227950. Maximum sequence length: 2049, sample length: 2905 +[default0]:Skipping sample id=819083. Maximum sequence length: 2049, sample length: 2137 +[default0]:Skipping sample id=124590. Maximum sequence length: 2049, sample length: 4037 +[default0]:Skipping sample id=32225. Maximum sequence length: 2049, sample length: 2237 +[default0]:Skipping sample id=152193. Maximum sequence length: 2049, sample length: 2528 +[default0]:Skipping sample id=975579. Maximum sequence length: 2049, sample length: 2055 +[default0]:Skipping sample id=994878. Maximum sequence length: 2049, sample length: 2534 +[default0]:Skipping sample id=23110. Maximum sequence length: 2049, sample length: 2522 +[default0]:Skipping sample id=710189. Maximum sequence length: 2049, sample length: 2347 +[default0]:Skipping sample id=915691. Maximum sequence length: 2049, sample length: 3856 +[default0]:Skipping sample id=835632. Maximum sequence length: 2049, sample length: 2568 +[default0]:Skipping sample id=1344629. Maximum sequence length: 2049, sample length: 6165 +[default0]:Skipping sample id=20402. Maximum sequence length: 2049, sample length: 3435 +[default0]:Skipping sample id=891303. Maximum sequence length: 2049, sample length: 2542 +[default0]:Skipping sample id=1065779. Maximum sequence length: 2049, sample length: 2161 +[default0]:Skipping sample id=211888. Maximum sequence length: 2049, sample length: 3459 +[default0]:Skipping sample id=359823. Maximum sequence length: 2049, sample length: 3565 +[default0]:Skipping sample id=356227. Maximum sequence length: 2049, sample length: 4518 +[default0]:Skipping sample id=232523. Maximum sequence length: 2049, sample length: 3155 +[default0]:Skipping sample id=1552444. Maximum sequence length: 2049, sample length: 2728 +[default0]:Skipping sample id=1448641. Maximum sequence length: 2049, sample length: 2325 +[default0]:Skipping sample id=397118. Maximum sequence length: 2049, sample length: 4196 +[default0]:Skipping sample id=709904. Maximum sequence length: 2049, sample length: 4787 +[default0]:Skipping sample id=1154543. Maximum sequence length: 2049, sample length: 3731 +[default0]:Skipping sample id=1342193. Maximum sequence length: 2049, sample length: 4014 +[default0]:Skipping sample id=1231561. Maximum sequence length: 2049, sample length: 4312 +[default0]:Skipping sample id=1208392. Maximum sequence length: 2049, sample length: 2127 +[default0]:Skipping sample id=447507. Maximum sequence length: 2049, sample length: 2072 +[default0]:Skipping sample id=206455. Maximum sequence length: 2049, sample length: 2196 +[default0]:Skipping sample id=953208. Maximum sequence length: 2049, sample length: 2342 +[default0]:Skipping sample id=1390394. Maximum sequence length: 2049, sample length: 3051 +[default0]:Skipping sample id=1284126. Maximum sequence length: 2049, sample length: 2612 +[default0]:Skipping sample id=1453405. Maximum sequence length: 2049, sample length: 2426 +[default0]:Skipping sample id=880473. Maximum sequence length: 2049, sample length: 2444 +[default0]:Skipping sample id=1243368. Maximum sequence length: 2049, sample length: 4265 +[default0]:Skipping sample id=1091317. Maximum sequence length: 2049, sample length: 2561 +[default0]:Skipping sample id=1151713. Maximum sequence length: 2049, sample length: 2204 +[default0]:Skipping sample id=151000. Maximum sequence length: 2049, sample length: 2469 +[default0]:Skipping sample id=170401. Maximum sequence length: 2049, sample length: 3239 +[default0]:Skipping sample id=858084. Maximum sequence length: 2049, sample length: 2621 +[default0]:Skipping sample id=1110988. Maximum sequence length: 2049, sample length: 2055 +[default0]:Skipping sample id=1294815. Maximum sequence length: 2049, sample length: 2153 +[default0]:Skipping sample id=1010253. Maximum sequence length: 2049, sample length: 2816 +[default0]:Skipping sample id=843998. Maximum sequence length: 2049, sample length: 6521 +[default0]:Skipping sample id=402694. Maximum sequence length: 2049, sample length: 2063 +[default0]:Skipping sample id=484659. Maximum sequence length: 2049, sample length: 2181 +[default0]:Skipping sample id=912701. Maximum sequence length: 2049, sample length: 2525 +[default0]:Skipping sample id=1271328. Maximum sequence length: 2049, sample length: 2367 +[default0]:Skipping sample id=458964. Maximum sequence length: 2049, sample length: 2399 +[default0]:Skipping sample id=1200387. Maximum sequence length: 2049, sample length: 3592 +[default0]:Skipping sample id=48746. Maximum sequence length: 2049, sample length: 2625 +[default0]:Skipping sample id=1563869. Maximum sequence length: 2049, sample length: 2669 +[default0]:Skipping sample id=618416. Maximum sequence length: 2049, sample length: 2506 +[default0]:Skipping sample id=1289356. Maximum sequence length: 2049, sample length: 5571 +[default0]:Skipping sample id=676533. Maximum sequence length: 2049, sample length: 3471 +[default0]:Skipping sample id=907806. Maximum sequence length: 2049, sample length: 3059 +[default0]:Skipping sample id=115562. Maximum sequence length: 2049, sample length: 2062 +[default0]:Skipping sample id=761936. Maximum sequence length: 2049, sample length: 2213 +[default0]:Skipping sample id=424302. Maximum sequence length: 2049, sample length: 2449 +[default0]:Skipping sample id=1530017. Maximum sequence length: 2049, sample length: 2741 +[default0]:Skipping sample id=1553969. Maximum sequence length: 2049, sample length: 3598 +[default0]:Skipping sample id=240029. Maximum sequence length: 2049, sample length: 3016 +[default0]:Skipping sample id=302632. Maximum sequence length: 2049, sample length: 2482 +[default0]:Skipping sample id=853362. Maximum sequence length: 2049, sample length: 2078 +[default0]:Skipping sample id=1202855. Maximum sequence length: 2049, sample length: 4047 +[default0]:Skipping sample id=1494584. Maximum sequence length: 2049, sample length: 4823 +[default0]:Skipping sample id=1558942. Maximum sequence length: 2049, sample length: 3720 +[default0]:Skipping sample id=473007. Maximum sequence length: 2049, sample length: 3779 +[default0]:Skipping sample id=1525552. Maximum sequence length: 2049, sample length: 3763 +[default0]:Skipping sample id=793049. Maximum sequence length: 2049, sample length: 3415 +[default0]:Skipping sample id=141553. Maximum sequence length: 2049, sample length: 2086 +[default0]:Skipping sample id=1284607. Maximum sequence length: 2049, sample length: 2322 +[default0]:Skipping sample id=95501. Maximum sequence length: 2049, sample length: 2755 +[default0]:Skipping sample id=822407. Maximum sequence length: 2049, sample length: 2538 +[default0]:Skipping sample id=416069. Maximum sequence length: 2049, sample length: 3123 +[default0]:Skipping sample id=1128330. Maximum sequence length: 2049, sample length: 2177 +[default0]:Skipping sample id=1081828. Maximum sequence length: 2049, sample length: 2283 +[default0]:Skipping sample id=71502. Maximum sequence length: 2049, sample length: 2266 +[default0]:Skipping sample id=908820. Maximum sequence length: 2049, sample length: 2576 +[default0]:Skipping sample id=31316. Maximum sequence length: 2049, sample length: 4910 +[default0]:Skipping sample id=48117. Maximum sequence length: 2049, sample length: 2575 +[default0]:Skipping sample id=737424. Maximum sequence length: 2049, sample length: 3269 +[default0]:Skipping sample id=438133. Maximum sequence length: 2049, sample length: 2691 +[default0]:Skipping sample id=682971. Maximum sequence length: 2049, sample length: 2502 +[default0]:Skipping sample id=403684. Maximum sequence length: 2049, sample length: 2228 +[default0]:Skipping sample id=1146128. Maximum sequence length: 2049, sample length: 3137 +[default0]:Skipping sample id=641382. Maximum sequence length: 2049, sample length: 3979 +[default0]:Skipping sample id=137955. Maximum sequence length: 2049, sample length: 3261 +[default0]:Skipping sample id=577533. Maximum sequence length: 2049, sample length: 2431 +[default0]:Skipping sample id=907487. Maximum sequence length: 2049, sample length: 2651 +[default0]:Skipping sample id=222425. Maximum sequence length: 2049, sample length: 3637 +[default0]:Skipping sample id=303435. Maximum sequence length: 2049, sample length: 2349 +[default0]:Skipping sample id=1351514. Maximum sequence length: 2049, sample length: 2382 +[default0]:Skipping sample id=1355814. Maximum sequence length: 2049, sample length: 2756 +[default0]:Skipping sample id=1068900. Maximum sequence length: 2049, sample length: 3726 +[default0]:Skipping sample id=40728. Maximum sequence length: 2049, sample length: 3676 +[default0]:Skipping sample id=1539162. Maximum sequence length: 2049, sample length: 2743 +[default0]:Skipping sample id=1517923. Maximum sequence length: 2049, sample length: 2706 +[default0]:Skipping sample id=834019. Maximum sequence length: 2049, sample length: 2347 +[default0]:Skipping sample id=641028. Maximum sequence length: 2049, sample length: 2065 +[default0]:Skipping sample id=594017. Maximum sequence length: 2049, sample length: 2327 +[default0]:Skipping sample id=809299. Maximum sequence length: 2049, sample length: 2844 +[default0]:Skipping sample id=1346901. Maximum sequence length: 2049, sample length: 2222 +[default0]:Skipping sample id=208071. Maximum sequence length: 2049, sample length: 2194 +[default0]:Skipping sample id=1489352. Maximum sequence length: 2049, sample length: 2108 +[default0]:Skipping sample id=616176. Maximum sequence length: 2049, sample length: 7151 +[default0]:Skipping sample id=1537978. Maximum sequence length: 2049, sample length: 2159 +[default0]:Skipping sample id=327737. Maximum sequence length: 2049, sample length: 2484 +[default0]:Skipping sample id=328619. Maximum sequence length: 2049, sample length: 2823 +[default0]:Skipping sample id=1066131. Maximum sequence length: 2049, sample length: 2777 +[default0]:Skipping sample id=869886. Maximum sequence length: 2049, sample length: 2772 +[default0]:Skipping sample id=1339147. Maximum sequence length: 2049, sample length: 2381 +[default0]:Skipping sample id=157845. Maximum sequence length: 2049, sample length: 2588 +[default0]:Skipping sample id=520351. Maximum sequence length: 2049, sample length: 4802 +[default0]:Skipping sample id=744489. Maximum sequence length: 2049, sample length: 2474 +[default0]:Skipping sample id=349390. Maximum sequence length: 2049, sample length: 5405 +[default0]:Skipping sample id=145030. Maximum sequence length: 2049, sample length: 2458 +[default0]:Skipping sample id=639744. Maximum sequence length: 2049, sample length: 2340 +[default0]:Skipping sample id=1082606. Maximum sequence length: 2049, sample length: 2473 +[default0]:Skipping sample id=760333. Maximum sequence length: 2049, sample length: 2211 +[default0]:Skipping sample id=1268591. Maximum sequence length: 2049, sample length: 3548 +[default0]:Skipping sample id=1487853. Maximum sequence length: 2049, sample length: 2182 +[default0]:Skipping sample id=364875. Maximum sequence length: 2049, sample length: 2497 +[default0]:Skipping sample id=369589. Maximum sequence length: 2049, sample length: 3228 +[default0]:Skipping sample id=1255682. Maximum sequence length: 2049, sample length: 3207 +[default0]:Skipping sample id=1166188. Maximum sequence length: 2049, sample length: 2223 +[default0]:Skipping sample id=370035. Maximum sequence length: 2049, sample length: 2137 +[default0]:Skipping sample id=878527. Maximum sequence length: 2049, sample length: 2086 +[default0]:Skipping sample id=690610. Maximum sequence length: 2049, sample length: 2511 +[default0]:Skipping sample id=1541657. Maximum sequence length: 2049, sample length: 2599 +[default0]:Skipping sample id=267387. Maximum sequence length: 2049, sample length: 6572 +[default0]:Skipping sample id=1121886. Maximum sequence length: 2049, sample length: 2402 +[default0]:Skipping sample id=1483052. Maximum sequence length: 2049, sample length: 2514 +[default0]:Skipping sample id=435618. Maximum sequence length: 2049, sample length: 3084 +[default0]:Skipping sample id=1441836. Maximum sequence length: 2049, sample length: 3179 +[default0]:Skipping sample id=59532. Maximum sequence length: 2049, sample length: 2836 +[default0]:Skipping sample id=213509. Maximum sequence length: 2049, sample length: 4192 +[default0]:Skipping sample id=207896. Maximum sequence length: 2049, sample length: 2066 +[default0]:Skipping sample id=750952. Maximum sequence length: 2049, sample length: 2138 +[default0]:Skipping sample id=776645. Maximum sequence length: 2049, sample length: 2166 +[default0]:Skipping sample id=554803. Maximum sequence length: 2049, sample length: 2821 +[default0]:Skipping sample id=530002. Maximum sequence length: 2049, sample length: 3422 +[default0]:Skipping sample id=1069638. Maximum sequence length: 2049, sample length: 2295 +[default0]:Skipping sample id=1411204. Maximum sequence length: 2049, sample length: 2058 +[default0]:Skipping sample id=721531. Maximum sequence length: 2049, sample length: 3591 +[default0]:Skipping sample id=44350. Maximum sequence length: 2049, sample length: 2126 +[default0]:Skipping sample id=1327474. Maximum sequence length: 2049, sample length: 2203 +[default0]:Skipping sample id=682951. Maximum sequence length: 2049, sample length: 2114 +[default0]:Skipping sample id=909043. Maximum sequence length: 2049, sample length: 2590 +[default0]:Skipping sample id=1270408. Maximum sequence length: 2049, sample length: 3746 +[default0]:Skipping sample id=1202825. Maximum sequence length: 2049, sample length: 2518 +[default0]:Skipping sample id=1122615. Maximum sequence length: 2049, sample length: 2427 +[default0]:Skipping sample id=742429. Maximum sequence length: 2049, sample length: 3083 +[default0]:Skipping sample id=288287. Maximum sequence length: 2049, sample length: 2273 +[default0]:Skipping sample id=241557. Maximum sequence length: 2049, sample length: 3023 +[default0]:Skipping sample id=83957. Maximum sequence length: 2049, sample length: 3233 +[default0]:Skipping sample id=1365620. Maximum sequence length: 2049, sample length: 2911 +[default0]:Skipping sample id=436539. Maximum sequence length: 2049, sample length: 3829 +[default0]:Skipping sample id=1045744. Maximum sequence length: 2049, sample length: 2862 +[default0]:Skipping sample id=1511173. Maximum sequence length: 2049, sample length: 2130 +[default0]:Skipping sample id=711758. Maximum sequence length: 2049, sample length: 2173 +[default0]:Skipping sample id=618822. Maximum sequence length: 2049, sample length: 2969 +[default0]:Skipping sample id=1557616. Maximum sequence length: 2049, sample length: 6375 +[default0]:Skipping sample id=705382. Maximum sequence length: 2049, sample length: 2359 +[default0]:Skipping sample id=64983. Maximum sequence length: 2049, sample length: 2322 +[default0]:Skipping sample id=375419. Maximum sequence length: 2049, sample length: 3788 +[default0]:Skipping sample id=1097452. Maximum sequence length: 2049, sample length: 2551 +[default0]:Skipping sample id=1398828. Maximum sequence length: 2049, sample length: 2196 +[default0]:Skipping sample id=1484390. Maximum sequence length: 2049, sample length: 2461 +[default0]:Skipping sample id=1124793. Maximum sequence length: 2049, sample length: 2397 +[default0]:Skipping sample id=933963. Maximum sequence length: 2049, sample length: 2908 +[default0]:Skipping sample id=412321. Maximum sequence length: 2049, sample length: 3368 +[default0]:Skipping sample id=1479184. Maximum sequence length: 2049, sample length: 2549 +[default0]:Skipping sample id=607259. Maximum sequence length: 2049, sample length: 2068 +[default0]:Skipping sample id=174747. Maximum sequence length: 2049, sample length: 3194 +[default0]:Skipping sample id=176466. Maximum sequence length: 2049, sample length: 2160 +[default0]:Skipping sample id=1237228. Maximum sequence length: 2049, sample length: 5813 +[default0]:Skipping sample id=30337. Maximum sequence length: 2049, sample length: 2752 +[default0]:Skipping sample id=1137343. Maximum sequence length: 2049, sample length: 2200 +[default0]:Skipping sample id=1085724. Maximum sequence length: 2049, sample length: 5180 +[default0]:Skipping sample id=1507283. Maximum sequence length: 2049, sample length: 2492 +[default0]:Skipping sample id=124554. Maximum sequence length: 2049, sample length: 3373 +[default0]:Skipping sample id=1208941. Maximum sequence length: 2049, sample length: 5611 +[default0]:Skipping sample id=589167. Maximum sequence length: 2049, sample length: 2072 +[default0]:Skipping sample id=81161. Maximum sequence length: 2049, sample length: 2358 +[default0]:Skipping sample id=51210. Maximum sequence length: 2049, sample length: 4587 +[default0]:Skipping sample id=1038115. Maximum sequence length: 2049, sample length: 4571 +[default0]:Skipping sample id=259496. Maximum sequence length: 2049, sample length: 2507 +[default0]:Skipping sample id=1505832. Maximum sequence length: 2049, sample length: 2415 +[default0]:Skipping sample id=1095009. Maximum sequence length: 2049, sample length: 2239 +[default0]:Skipping sample id=959576. Maximum sequence length: 2049, sample length: 3634 +[default0]:Skipping sample id=1443134. Maximum sequence length: 2049, sample length: 3410 +[default0]:Skipping sample id=488995. Maximum sequence length: 2049, sample length: 2976 +[default0]:Skipping sample id=769180. Maximum sequence length: 2049, sample length: 3168 +[default0]:Skipping sample id=186525. Maximum sequence length: 2049, sample length: 2869 +[default0]:Skipping sample id=1312448. Maximum sequence length: 2049, sample length: 2228 +[default0]:Skipping sample id=1023661. Maximum sequence length: 2049, sample length: 2081 +[default0]:Skipping sample id=1331864. Maximum sequence length: 2049, sample length: 2831 +[default0]:Skipping sample id=222298. Maximum sequence length: 2049, sample length: 2431 +[default0]:Skipping sample id=1451856. Maximum sequence length: 2049, sample length: 4750 +[default0]:Skipping sample id=338196. Maximum sequence length: 2049, sample length: 2225 +[default0]:Skipping sample id=75773. Maximum sequence length: 2049, sample length: 3268 +[default0]:Skipping sample id=1063395. Maximum sequence length: 2049, sample length: 2557 +[default0]:Skipping sample id=647547. Maximum sequence length: 2049, sample length: 4265 +[default0]:Skipping sample id=816664. Maximum sequence length: 2049, sample length: 2466 +[default0]:Skipping sample id=786774. Maximum sequence length: 2049, sample length: 2607 +[default0]:Skipping sample id=864908. Maximum sequence length: 2049, sample length: 4701 +[default0]:Skipping sample id=213156. Maximum sequence length: 2049, sample length: 3388 +[default0]:Skipping sample id=1247909. Maximum sequence length: 2049, sample length: 4035 +[default0]:Skipping sample id=885613. Maximum sequence length: 2049, sample length: 3228 +[default0]:Skipping sample id=1338236. Maximum sequence length: 2049, sample length: 5854 +[default0]:Skipping sample id=170387. Maximum sequence length: 2049, sample length: 3101 +[default0]:Skipping sample id=58002. Maximum sequence length: 2049, sample length: 2341 +[default0]:Skipping sample id=183646. Maximum sequence length: 2049, sample length: 3668 +[default0]:Skipping sample id=1219514. Maximum sequence length: 2049, sample length: 2173 +[default0]:Skipping sample id=1059649. Maximum sequence length: 2049, sample length: 2334 +[default0]:Skipping sample id=388738. Maximum sequence length: 2049, sample length: 3765 +[default0]:Skipping sample id=716959. Maximum sequence length: 2049, sample length: 2674 +[default0]:Skipping sample id=283597. Maximum sequence length: 2049, sample length: 5515 +[default0]:Skipping sample id=577776. Maximum sequence length: 2049, sample length: 2062 +[default0]:Skipping sample id=1563967. Maximum sequence length: 2049, sample length: 2053 +[default0]:Skipping sample id=899833. Maximum sequence length: 2049, sample length: 2485 +[default0]:Skipping sample id=1125033. Maximum sequence length: 2049, sample length: 2157 +[default0]:Skipping sample id=756392. Maximum sequence length: 2049, sample length: 2170 +[default0]:Skipping sample id=731966. Maximum sequence length: 2049, sample length: 3008 +[default0]:Skipping sample id=104497. Maximum sequence length: 2049, sample length: 2665 +[default0]:Skipping sample id=1321660. Maximum sequence length: 2049, sample length: 2109 +[default0]:Skipping sample id=990563. Maximum sequence length: 2049, sample length: 2809 +[default0]:Skipping sample id=973649. Maximum sequence length: 2049, sample length: 2912 +[default0]:Skipping sample id=1384683. Maximum sequence length: 2049, sample length: 5149 +[default0]:Skipping sample id=903210. Maximum sequence length: 2049, sample length: 2260 +[default0]:Skipping sample id=1188295. Maximum sequence length: 2049, sample length: 2590 +[default0]:Skipping sample id=350212. Maximum sequence length: 2049, sample length: 4671 +[default0]:Skipping sample id=922356. Maximum sequence length: 2049, sample length: 2143 +[default0]:Skipping sample id=86271. Maximum sequence length: 2049, sample length: 2052 +[default0]:Skipping sample id=162460. Maximum sequence length: 2049, sample length: 2422 +[default0]:Skipping sample id=84177. Maximum sequence length: 2049, sample length: 3654 +[default0]:Skipping sample id=1369823. Maximum sequence length: 2049, sample length: 8982 +[default0]:Skipping sample id=406218. Maximum sequence length: 2049, sample length: 2435 +[default0]:Skipping sample id=131284. Maximum sequence length: 2049, sample length: 2662 +[default0]:Skipping sample id=7685. Maximum sequence length: 2049, sample length: 2358 +[default0]:Skipping sample id=601911. Maximum sequence length: 2049, sample length: 2610 +[default0]:Skipping sample id=734577. Maximum sequence length: 2049, sample length: 2585 +[default0]:Skipping sample id=1128253. Maximum sequence length: 2049, sample length: 2118 +[default0]:Skipping sample id=1380591. Maximum sequence length: 2049, sample length: 2549 +[default0]:Skipping sample id=1374452. Maximum sequence length: 2049, sample length: 2721 +[default0]:Skipping sample id=938841. Maximum sequence length: 2049, sample length: 2763 +[default0]:Skipping sample id=1066602. Maximum sequence length: 2049, sample length: 2180 +[default0]:Skipping sample id=30656. Maximum sequence length: 2049, sample length: 2086 +[default0]:Skipping sample id=85031. Maximum sequence length: 2049, sample length: 2455 +[default0]:Skipping sample id=971694. Maximum sequence length: 2049, sample length: 2537 +[default0]:Skipping sample id=1140528. Maximum sequence length: 2049, sample length: 2645 +[default0]:Skipping sample id=43811. Maximum sequence length: 2049, sample length: 3180 +[default0]:Skipping sample id=209732. Maximum sequence length: 2049, sample length: 2113 +[default0]:Skipping sample id=968384. Maximum sequence length: 2049, sample length: 2626 +[default0]:Skipping sample id=561301. Maximum sequence length: 2049, sample length: 2110 +[default0]:Skipping sample id=534595. Maximum sequence length: 2049, sample length: 4042 +[default0]:Skipping sample id=614428. Maximum sequence length: 2049, sample length: 4721 +[default0]:Skipping sample id=197923. Maximum sequence length: 2049, sample length: 2612 +[default0]:Skipping sample id=322389. Maximum sequence length: 2049, sample length: 2298 +[default0]:Skipping sample id=1453400. Maximum sequence length: 2049, sample length: 2798 +[default0]:Skipping sample id=386018. Maximum sequence length: 2049, sample length: 2372 +[default0]:Skipping sample id=570065. Maximum sequence length: 2049, sample length: 3898 +[default0]:Skipping sample id=656267. Maximum sequence length: 2049, sample length: 2390 +[default0]:Skipping sample id=819257. Maximum sequence length: 2049, sample length: 2511 +[default0]:Skipping sample id=1239535. Maximum sequence length: 2049, sample length: 2355 +[default0]:Skipping sample id=884249. Maximum sequence length: 2049, sample length: 2060 +[default0]:Skipping sample id=441010. Maximum sequence length: 2049, sample length: 2321 +[default0]:Skipping sample id=933779. Maximum sequence length: 2049, sample length: 2660 +[default0]:Skipping sample id=1022630. Maximum sequence length: 2049, sample length: 3129 +[default0]:Skipping sample id=751264. Maximum sequence length: 2049, sample length: 3196 +[default0]:Skipping sample id=1433083. Maximum sequence length: 2049, sample length: 2695 +[default0]:Skipping sample id=578739. Maximum sequence length: 2049, sample length: 2461 +[default0]:Skipping sample id=339454. Maximum sequence length: 2049, sample length: 2134 +[default0]:Skipping sample id=1572279. Maximum sequence length: 2049, sample length: 4573 +[default0]:Skipping sample id=1265007. Maximum sequence length: 2049, sample length: 2854 +[default0]:Skipping sample id=318307. Maximum sequence length: 2049, sample length: 2544 +[default0]:Skipping sample id=716457. Maximum sequence length: 2049, sample length: 4618 +[default0]:Skipping sample id=307884. Maximum sequence length: 2049, sample length: 2622 +[default0]:Skipping sample id=274638. Maximum sequence length: 2049, sample length: 2674 +[default0]:Skipping sample id=1402357. Maximum sequence length: 2049, sample length: 3571 +[default0]:Skipping sample id=474915. Maximum sequence length: 2049, sample length: 2919 +[default0]:Skipping sample id=706663. Maximum sequence length: 2049, sample length: 2529 +[default0]:Skipping sample id=1362787. Maximum sequence length: 2049, sample length: 3900 +[default0]:Skipping sample id=1469013. Maximum sequence length: 2049, sample length: 2742 +[default0]:Skipping sample id=751594. Maximum sequence length: 2049, sample length: 2107 +[default0]:Skipping sample id=306196. Maximum sequence length: 2049, sample length: 2390 +[default0]:Skipping sample id=1396223. Maximum sequence length: 2049, sample length: 2791 +[default0]:Skipping sample id=1095729. Maximum sequence length: 2049, sample length: 2525 +[default0]:Skipping sample id=1210642. Maximum sequence length: 2049, sample length: 2082 +[default0]:Skipping sample id=529589. Maximum sequence length: 2049, sample length: 2225 +[default0]:Skipping sample id=1439693. Maximum sequence length: 2049, sample length: 2692 +[default0]:Skipping sample id=524606. Maximum sequence length: 2049, sample length: 3507 +[default0]:Skipping sample id=981856. Maximum sequence length: 2049, sample length: 2857 +[default0]:Skipping sample id=1403081. Maximum sequence length: 2049, sample length: 3268 +[default0]:Skipping sample id=1785. Maximum sequence length: 2049, sample length: 2587 +[default0]:Skipping sample id=856373. Maximum sequence length: 2049, sample length: 2375 +[default0]:Skipping sample id=1408774. Maximum sequence length: 2049, sample length: 2152 +[default0]:Skipping sample id=1299903. Maximum sequence length: 2049, sample length: 2142 +[default0]:Skipping sample id=381002. Maximum sequence length: 2049, sample length: 2913 +[default0]:Skipping sample id=1510755. Maximum sequence length: 2049, sample length: 3646 +[default0]:Skipping sample id=629603. Maximum sequence length: 2049, sample length: 3026 +[default0]:Skipping sample id=1234986. Maximum sequence length: 2049, sample length: 2236 +[default0]:Skipping sample id=1347832. Maximum sequence length: 2049, sample length: 7304 +[default0]:Skipping sample id=1341729. Maximum sequence length: 2049, sample length: 4054 +[default0]:Skipping sample id=152603. Maximum sequence length: 2049, sample length: 6950 +[default0]:Skipping sample id=261615. Maximum sequence length: 2049, sample length: 3149 +[default0]:Skipping sample id=600644. Maximum sequence length: 2049, sample length: 3509 +[default0]:Skipping sample id=271577. Maximum sequence length: 2049, sample length: 2078 +[default0]:Skipping sample id=1223427. Maximum sequence length: 2049, sample length: 2262 +[default0]:Skipping sample id=705760. Maximum sequence length: 2049, sample length: 2952 +[default0]:Skipping sample id=983081. Maximum sequence length: 2049, sample length: 2133 +[default0]:Skipping sample id=242561. Maximum sequence length: 2049, sample length: 2522 +[default0]:Skipping sample id=616102. Maximum sequence length: 2049, sample length: 4125 +[default0]:Skipping sample id=1143298. Maximum sequence length: 2049, sample length: 2897 +[default0]:Skipping sample id=1081100. Maximum sequence length: 2049, sample length: 3984 +[default0]:Skipping sample id=771853. Maximum sequence length: 2049, sample length: 3148 +[default0]:Skipping sample id=960370. Maximum sequence length: 2049, sample length: 2392 +[default0]:Skipping sample id=1171448. Maximum sequence length: 2049, sample length: 2686 +[default0]:Skipping sample id=91347. Maximum sequence length: 2049, sample length: 3666 +[default0]:Skipping sample id=252140. Maximum sequence length: 2049, sample length: 2124 +[default0]:Skipping sample id=919514. Maximum sequence length: 2049, sample length: 2305 +[default0]:Skipping sample id=1137160. Maximum sequence length: 2049, sample length: 4205 +[default0]:Skipping sample id=1278200. Maximum sequence length: 2049, sample length: 2817 +[default0]:Skipping sample id=788. Maximum sequence length: 2049, sample length: 3723 +[default0]:Skipping sample id=1225164. Maximum sequence length: 2049, sample length: 3426 +[default0]:Skipping sample id=1328688. Maximum sequence length: 2049, sample length: 2161 +[default0]:Skipping sample id=1173897. Maximum sequence length: 2049, sample length: 2500 +[default0]:Skipping sample id=1539337. Maximum sequence length: 2049, sample length: 2680 +[default0]:Skipping sample id=1461045. Maximum sequence length: 2049, sample length: 2396 +[default0]:Skipping sample id=248131. Maximum sequence length: 2049, sample length: 2395 +[default0]:Skipping sample id=1312303. Maximum sequence length: 2049, sample length: 2199 +[default0]:Skipping sample id=1272613. Maximum sequence length: 2049, sample length: 2064 +[default0]:Skipping sample id=34527. Maximum sequence length: 2049, sample length: 3916 +[default0]:Skipping sample id=295787. Maximum sequence length: 2049, sample length: 3585 +[default0]:Skipping sample id=205571. Maximum sequence length: 2049, sample length: 3597 +[default0]:Skipping sample id=1551711. Maximum sequence length: 2049, sample length: 2400 +[default0]:Skipping sample id=1033366. Maximum sequence length: 2049, sample length: 2501 +[default0]:Skipping sample id=616200. Maximum sequence length: 2049, sample length: 3568 +[default0]:Skipping sample id=694551. Maximum sequence length: 2049, sample length: 3161 +[default0]:Skipping sample id=509290. Maximum sequence length: 2049, sample length: 2894 +[default0]:Skipping sample id=1529693. Maximum sequence length: 2049, sample length: 3274 +[default0]:Skipping sample id=1305177. Maximum sequence length: 2049, sample length: 3430 +[default0]:Skipping sample id=1024448. Maximum sequence length: 2049, sample length: 2182 +[default0]:Skipping sample id=1425717. Maximum sequence length: 2049, sample length: 2277 +[default0]:Skipping sample id=451429. Maximum sequence length: 2049, sample length: 5085 +[default0]:Skipping sample id=1107611. Maximum sequence length: 2049, sample length: 2924 +[default0]:Skipping sample id=39355. Maximum sequence length: 2049, sample length: 2911 +[default0]:Skipping sample id=1339948. Maximum sequence length: 2049, sample length: 2460 +[default0]:Skipping sample id=1552477. Maximum sequence length: 2049, sample length: 3106 +[default0]:Skipping sample id=553896. Maximum sequence length: 2049, sample length: 2684 +[default0]:Skipping sample id=1348221. Maximum sequence length: 2049, sample length: 3056 +[default0]:Skipping sample id=220419. Maximum sequence length: 2049, sample length: 2471 +[default0]:Skipping sample id=443643. Maximum sequence length: 2049, sample length: 2324 +[default0]:Skipping sample id=1149912. Maximum sequence length: 2049, sample length: 5750 +[default0]:Skipping sample id=729203. Maximum sequence length: 2049, sample length: 2332 +[default0]:Skipping sample id=568296. Maximum sequence length: 2049, sample length: 3414 +[default0]:Skipping sample id=417222. Maximum sequence length: 2049, sample length: 2227 +[default0]:Skipping sample id=74297. Maximum sequence length: 2049, sample length: 2296 +[default0]:Skipping sample id=566926. Maximum sequence length: 2049, sample length: 2207 +[default0]:Skipping sample id=397295. Maximum sequence length: 2049, sample length: 5776 +[default0]:Skipping sample id=1562990. Maximum sequence length: 2049, sample length: 3246 +[default0]:Skipping sample id=164055. Maximum sequence length: 2049, sample length: 2087 +[default0]:Skipping sample id=502696. Maximum sequence length: 2049, sample length: 5895 +[default0]:Skipping sample id=207358. Maximum sequence length: 2049, sample length: 2821 +[default0]:Skipping sample id=696657. Maximum sequence length: 2049, sample length: 2854 +[default0]:Skipping sample id=685238. Maximum sequence length: 2049, sample length: 2579 +[default0]:Skipping sample id=1018426. Maximum sequence length: 2049, sample length: 2123 +[default0]:Skipping sample id=1449981. Maximum sequence length: 2049, sample length: 3756 +[default0]:Skipping sample id=790488. Maximum sequence length: 2049, sample length: 3092 +[default0]:Skipping sample id=939669. Maximum sequence length: 2049, sample length: 2543 +[default0]:Skipping sample id=629047. Maximum sequence length: 2049, sample length: 2148 +[default0]:Skipping sample id=1244987. Maximum sequence length: 2049, sample length: 2270 +[default0]:Skipping sample id=413564. Maximum sequence length: 2049, sample length: 3138 +[default0]:Skipping sample id=236089. Maximum sequence length: 2049, sample length: 2583 +[default0]:Skipping sample id=1290318. Maximum sequence length: 2049, sample length: 2798 +[default0]:Skipping sample id=1372884. Maximum sequence length: 2049, sample length: 2339 +[default0]:Skipping sample id=311089. Maximum sequence length: 2049, sample length: 2289 +[default0]:Skipping sample id=769909. Maximum sequence length: 2049, sample length: 3256 +[default0]:Skipping sample id=1176755. Maximum sequence length: 2049, sample length: 3429 +[default0]:Skipping sample id=244542. Maximum sequence length: 2049, sample length: 2429 +[default0]:Skipping sample id=827935. Maximum sequence length: 2049, sample length: 2146 +[default0]:Skipping sample id=1286981. Maximum sequence length: 2049, sample length: 2951 +[default0]:Skipping sample id=601564. Maximum sequence length: 2049, sample length: 5947 +[default0]:Skipping sample id=366597. Maximum sequence length: 2049, sample length: 3795 +[default0]:Skipping sample id=13703. Maximum sequence length: 2049, sample length: 2129 +[default0]:Skipping sample id=1396345. Maximum sequence length: 2049, sample length: 3266 +[default0]:Skipping sample id=520752. Maximum sequence length: 2049, sample length: 4154 +[default0]:Skipping sample id=1173944. Maximum sequence length: 2049, sample length: 3492 +[default0]:Skipping sample id=1486393. Maximum sequence length: 2049, sample length: 2520 +[default0]:Skipping sample id=1208340. Maximum sequence length: 2049, sample length: 4099 +[default0]:Skipping sample id=917518. Maximum sequence length: 2049, sample length: 4756 +[default0]:Skipping sample id=893374. Maximum sequence length: 2049, sample length: 2066 +[default0]:Skipping sample id=12832. Maximum sequence length: 2049, sample length: 3862 +[default0]:Skipping sample id=885093. Maximum sequence length: 2049, sample length: 2663 +[default0]:Skipping sample id=490587. Maximum sequence length: 2049, sample length: 2156 +[default0]:Skipping sample id=558916. Maximum sequence length: 2049, sample length: 2288 +[default0]:Skipping sample id=866643. Maximum sequence length: 2049, sample length: 5149 +[default0]:Skipping sample id=987216. Maximum sequence length: 2049, sample length: 2720 +[default0]:Skipping sample id=1393224. Maximum sequence length: 2049, sample length: 4079 +[default0]:Skipping sample id=37791. Maximum sequence length: 2049, sample length: 2331 +[default0]:Skipping sample id=145154. Maximum sequence length: 2049, sample length: 2635 +[default0]:Skipping sample id=545820. Maximum sequence length: 2049, sample length: 2797 +[default0]:Skipping sample id=558371. Maximum sequence length: 2049, sample length: 2837 +[default0]:Skipping sample id=319959. Maximum sequence length: 2049, sample length: 2399 +[default0]:Skipping sample id=419015. Maximum sequence length: 2049, sample length: 2244 +[default0]:Skipping sample id=504723. Maximum sequence length: 2049, sample length: 2496 +[default0]:Skipping sample id=264445. Maximum sequence length: 2049, sample length: 2204 +[default0]:Skipping sample id=91681. Maximum sequence length: 2049, sample length: 2920 +[default0]:Skipping sample id=625812. Maximum sequence length: 2049, sample length: 2935 +[default0]:Skipping sample id=1258090. Maximum sequence length: 2049, sample length: 3617 +[default0]:Skipping sample id=834480. Maximum sequence length: 2049, sample length: 2586 +[default0]:Skipping sample id=1271184. Maximum sequence length: 2049, sample length: 3999 +[default0]:Skipping sample id=850332. Maximum sequence length: 2049, sample length: 2437 +[default0]:Skipping sample id=256901. Maximum sequence length: 2049, sample length: 2231 +[default0]:Skipping sample id=139137. Maximum sequence length: 2049, sample length: 2214 +[default0]:Skipping sample id=693576. Maximum sequence length: 2049, sample length: 2372 +[default0]:Skipping sample id=655676. Maximum sequence length: 2049, sample length: 2782 +[default0]:Skipping sample id=706737. Maximum sequence length: 2049, sample length: 2825 +[default0]:Skipping sample id=210575. Maximum sequence length: 2049, sample length: 2692 +[default0]:Skipping sample id=699119. Maximum sequence length: 2049, sample length: 2368 +[default0]:Skipping sample id=1491357. Maximum sequence length: 2049, sample length: 2429 +[default0]:Skipping sample id=1270314. Maximum sequence length: 2049, sample length: 3166 +[default0]:Skipping sample id=1096980. Maximum sequence length: 2049, sample length: 2187 +[default0]:Skipping sample id=688377. Maximum sequence length: 2049, sample length: 2108 +[default0]:Skipping sample id=267089. Maximum sequence length: 2049, sample length: 2883 +[default0]:Skipping sample id=480835. Maximum sequence length: 2049, sample length: 2658 +[default0]:Skipping sample id=5304. Maximum sequence length: 2049, sample length: 2718 +[default0]:Skipping sample id=962151. Maximum sequence length: 2049, sample length: 4326 +[default0]:Skipping sample id=137674. Maximum sequence length: 2049, sample length: 2645 +[default0]:Skipping sample id=930849. Maximum sequence length: 2049, sample length: 2428 +[default0]:Skipping sample id=530388. Maximum sequence length: 2049, sample length: 3607 +[default0]:Skipping sample id=56036. Maximum sequence length: 2049, sample length: 2145 +[default0]:Skipping sample id=375023. Maximum sequence length: 2049, sample length: 2651 +[default0]:Skipping sample id=564469. Maximum sequence length: 2049, sample length: 2249 +[default0]:Skipping sample id=51531. Maximum sequence length: 2049, sample length: 3075 +[default0]:Skipping sample id=1146255. Maximum sequence length: 2049, sample length: 4027 +[default0]:Skipping sample id=1034464. Maximum sequence length: 2049, sample length: 2079 +[default0]:Skipping sample id=162527. Maximum sequence length: 2049, sample length: 3014 +[default0]:Skipping sample id=1080429. Maximum sequence length: 2049, sample length: 2516 +[default0]:Skipping sample id=89635. Maximum sequence length: 2049, sample length: 3216 +[default0]:Skipping sample id=1053257. Maximum sequence length: 2049, sample length: 5002 +[default0]:Skipping sample id=1044131. Maximum sequence length: 2049, sample length: 4254 +[default0]:Skipping sample id=1465276. Maximum sequence length: 2049, sample length: 2724 +[default0]:Skipping sample id=312502. Maximum sequence length: 2049, sample length: 2215 +[default0]:Skipping sample id=464841. Maximum sequence length: 2049, sample length: 2126 +[default0]:Skipping sample id=915214. Maximum sequence length: 2049, sample length: 2700 +[default0]:Skipping sample id=1019981. Maximum sequence length: 2049, sample length: 2671 +[default0]:Skipping sample id=421355. Maximum sequence length: 2049, sample length: 3591 +[default0]:Skipping sample id=265679. Maximum sequence length: 2049, sample length: 2231 +[default0]:Skipping sample id=719962. Maximum sequence length: 2049, sample length: 2596 +[default0]:Skipping sample id=19095. Maximum sequence length: 2049, sample length: 2693 +[default0]:Skipping sample id=486314. Maximum sequence length: 2049, sample length: 3811 +[default0]:Skipping sample id=437345. Maximum sequence length: 2049, sample length: 2359 +[default0]:Skipping sample id=523415. Maximum sequence length: 2049, sample length: 3359 +[default0]:Skipping sample id=1173495. Maximum sequence length: 2049, sample length: 2203 +[default0]:Skipping sample id=283674. Maximum sequence length: 2049, sample length: 2126 +[default0]:Skipping sample id=345394. Maximum sequence length: 2049, sample length: 2097 +[default0]:Skipping sample id=205782. Maximum sequence length: 2049, sample length: 2179 +[default0]:Skipping sample id=1049152. Maximum sequence length: 2049, sample length: 2267 +[default0]:Skipping sample id=461. Maximum sequence length: 2049, sample length: 4892 +[default0]:Skipping sample id=39598. Maximum sequence length: 2049, sample length: 2139 +[default0]:Skipping sample id=1567669. Maximum sequence length: 2049, sample length: 2354 +[default0]:Skipping sample id=879735. Maximum sequence length: 2049, sample length: 2493 +[default0]:Skipping sample id=1447767. Maximum sequence length: 2049, sample length: 3056 +[default0]:Skipping sample id=564745. Maximum sequence length: 2049, sample length: 3090 +[default0]:Skipping sample id=352970. Maximum sequence length: 2049, sample length: 4150 +[default0]:Skipping sample id=1355701. Maximum sequence length: 2049, sample length: 2073 +[default0]:Skipping sample id=251561. Maximum sequence length: 2049, sample length: 2265 +[default0]:Skipping sample id=440957. Maximum sequence length: 2049, sample length: 3855 +[default0]:Skipping sample id=300827. Maximum sequence length: 2049, sample length: 3794 +[default0]:Skipping sample id=356375. Maximum sequence length: 2049, sample length: 2889 +[default0]:Skipping sample id=531980. Maximum sequence length: 2049, sample length: 4428 +[default0]:Skipping sample id=283728. Maximum sequence length: 2049, sample length: 3472 +[default0]:Skipping sample id=1197820. Maximum sequence length: 2049, sample length: 2246 +[default0]:Skipping sample id=1464656. Maximum sequence length: 2049, sample length: 2128 +[default0]:Skipping sample id=265983. Maximum sequence length: 2049, sample length: 4023 +[default0]:Skipping sample id=563832. Maximum sequence length: 2049, sample length: 2829 +[default0]:Skipping sample id=12705. Maximum sequence length: 2049, sample length: 3419 +[default0]:Skipping sample id=238093. Maximum sequence length: 2049, sample length: 2836 +[default0]:Skipping sample id=1318779. Maximum sequence length: 2049, sample length: 2499 +[default0]:Skipping sample id=704384. Maximum sequence length: 2049, sample length: 2788 +[default0]:Skipping sample id=1085298. Maximum sequence length: 2049, sample length: 4193 +[default0]:Skipping sample id=675980. Maximum sequence length: 2049, sample length: 2604 +[default0]:Skipping sample id=1503871. Maximum sequence length: 2049, sample length: 5829 +[default0]:Skipping sample id=999760. Maximum sequence length: 2049, sample length: 2395 +[default0]:Skipping sample id=1532761. Maximum sequence length: 2049, sample length: 4110 +[default0]:Skipping sample id=556188. Maximum sequence length: 2049, sample length: 3137 +[default0]:Skipping sample id=870245. Maximum sequence length: 2049, sample length: 2616 +[default0]:Skipping sample id=1012037. Maximum sequence length: 2049, sample length: 3329 +[default0]:Skipping sample id=670113. Maximum sequence length: 2049, sample length: 3060 +[default0]:Skipping sample id=1227003. Maximum sequence length: 2049, sample length: 2161 +[default0]:Skipping sample id=381900. Maximum sequence length: 2049, sample length: 8236 +[default0]:Skipping sample id=216731. Maximum sequence length: 2049, sample length: 2627 +[default0]:Skipping sample id=43611. Maximum sequence length: 2049, sample length: 2976 +[default0]:Skipping sample id=609417. Maximum sequence length: 2049, sample length: 2517 +[default0]:Skipping sample id=937585. Maximum sequence length: 2049, sample length: 2209 +[default0]:Skipping sample id=808392. Maximum sequence length: 2049, sample length: 2143 +[default0]:Skipping sample id=997335. Maximum sequence length: 2049, sample length: 4969 +[default0]:Skipping sample id=923535. Maximum sequence length: 2049, sample length: 2990 +[default0]:Skipping sample id=1301448. Maximum sequence length: 2049, sample length: 2376 +[default0]:Skipping sample id=459919. Maximum sequence length: 2049, sample length: 2242 +[default0]:Skipping sample id=180260. Maximum sequence length: 2049, sample length: 2209 +[default0]:Skipping sample id=1515844. Maximum sequence length: 2049, sample length: 2705 +[default0]:Skipping sample id=1184083. Maximum sequence length: 2049, sample length: 2715 +[default0]:Skipping sample id=1384918. Maximum sequence length: 2049, sample length: 2350 +[default0]:Skipping sample id=147653. Maximum sequence length: 2049, sample length: 6514 +[default0]:Skipping sample id=236008. Maximum sequence length: 2049, sample length: 2105 +[default0]:Skipping sample id=1380195. Maximum sequence length: 2049, sample length: 3485 +[default0]:Skipping sample id=1371649. Maximum sequence length: 2049, sample length: 2659 +[default0]:Skipping sample id=1426047. Maximum sequence length: 2049, sample length: 3224 +[default0]:Skipping sample id=41529. Maximum sequence length: 2049, sample length: 3451 +[default0]:Skipping sample id=774750. Maximum sequence length: 2049, sample length: 3282 +[default0]:Skipping sample id=97555. Maximum sequence length: 2049, sample length: 3994 +[default0]:Skipping sample id=1343303. Maximum sequence length: 2049, sample length: 3950 +[default0]:Skipping sample id=1130759. Maximum sequence length: 2049, sample length: 3730 +[default0]:Skipping sample id=1048047. Maximum sequence length: 2049, sample length: 2060 +[default0]:Skipping sample id=903525. Maximum sequence length: 2049, sample length: 2288 +[default0]:Skipping sample id=760918. Maximum sequence length: 2049, sample length: 3097 +[default0]:Skipping sample id=1467096. Maximum sequence length: 2049, sample length: 3531 +[default0]:Skipping sample id=813710. Maximum sequence length: 2049, sample length: 2435 +[default0]:Skipping sample id=519255. Maximum sequence length: 2049, sample length: 2509 +[default0]:Skipping sample id=1217139. Maximum sequence length: 2049, sample length: 2627 +[default0]:Skipping sample id=1368100. Maximum sequence length: 2049, sample length: 2546 +[default0]:Skipping sample id=36809. Maximum sequence length: 2049, sample length: 2272 +[default0]:Skipping sample id=355816. Maximum sequence length: 2049, sample length: 4245 +[default0]:Skipping sample id=259089. Maximum sequence length: 2049, sample length: 2914 +[default0]:Skipping sample id=434765. Maximum sequence length: 2049, sample length: 2571 +[default0]:Skipping sample id=1195175. Maximum sequence length: 2049, sample length: 2682 +[default0]:Skipping sample id=210016. Maximum sequence length: 2049, sample length: 2995 +[default0]:Skipping sample id=812772. Maximum sequence length: 2049, sample length: 2213 +[default0]:Skipping sample id=2060. Maximum sequence length: 2049, sample length: 2743 +[default0]:Skipping sample id=1133922. Maximum sequence length: 2049, sample length: 2645 +[default0]:Skipping sample id=824071. Maximum sequence length: 2049, sample length: 2336 +[default0]:Skipping sample id=1374764. Maximum sequence length: 2049, sample length: 3156 +[default0]:Skipping sample id=1033825. Maximum sequence length: 2049, sample length: 2340 +[default0]:Skipping sample id=1074794. Maximum sequence length: 2049, sample length: 5858 +[default0]:Skipping sample id=562822. Maximum sequence length: 2049, sample length: 3189 +[default0]:Skipping sample id=414000. Maximum sequence length: 2049, sample length: 2158 +[default0]:Skipping sample id=180149. Maximum sequence length: 2049, sample length: 2914 +[default0]:Skipping sample id=1048460. Maximum sequence length: 2049, sample length: 2551 +[default0]:Skipping sample id=843676. Maximum sequence length: 2049, sample length: 2776 +[default0]:Skipping sample id=527642. Maximum sequence length: 2049, sample length: 2749 +[default0]:Skipping sample id=247817. Maximum sequence length: 2049, sample length: 2526 +[default0]:Skipping sample id=579807. Maximum sequence length: 2049, sample length: 2194 +[default0]:Skipping sample id=1510663. Maximum sequence length: 2049, sample length: 2132 +[default0]:Skipping sample id=937299. Maximum sequence length: 2049, sample length: 2235 +[default0]:Skipping sample id=982642. Maximum sequence length: 2049, sample length: 3366 +[default0]:Skipping sample id=1454336. Maximum sequence length: 2049, sample length: 2074 +[default0]:Skipping sample id=1236423. Maximum sequence length: 2049, sample length: 2928 +[default0]:Skipping sample id=1452915. Maximum sequence length: 2049, sample length: 3068 +[default0]:Skipping sample id=735824. Maximum sequence length: 2049, sample length: 2301 +[default0]:Skipping sample id=1523045. Maximum sequence length: 2049, sample length: 2078 +[default0]:Skipping sample id=979199. Maximum sequence length: 2049, sample length: 5632 +[default0]:Skipping sample id=880156. Maximum sequence length: 2049, sample length: 2920 +[default0]:Skipping sample id=37060. Maximum sequence length: 2049, sample length: 2175 +[default0]:Skipping sample id=1231258. Maximum sequence length: 2049, sample length: 3096 +[default0]:Skipping sample id=854363. Maximum sequence length: 2049, sample length: 3488 +[default0]:Skipping sample id=1255728. Maximum sequence length: 2049, sample length: 2147 +[default0]:Skipping sample id=1040728. Maximum sequence length: 2049, sample length: 2069 +[default0]:Skipping sample id=1104237. Maximum sequence length: 2049, sample length: 3485 +[default0]:Skipping sample id=835363. Maximum sequence length: 2049, sample length: 3164 +[default0]:Skipping sample id=1198403. Maximum sequence length: 2049, sample length: 4930 +[default0]:Skipping sample id=1025868. Maximum sequence length: 2049, sample length: 4649 +[default0]:Skipping sample id=747341. Maximum sequence length: 2049, sample length: 2491 +[default0]:Skipping sample id=726148. Maximum sequence length: 2049, sample length: 3519 +[default0]:Skipping sample id=1474615. Maximum sequence length: 2049, sample length: 2408 +[default0]:Skipping sample id=215530. Maximum sequence length: 2049, sample length: 4326 +[default0]:Skipping sample id=919639. Maximum sequence length: 2049, sample length: 3011 +[default0]:Skipping sample id=245063. Maximum sequence length: 2049, sample length: 3482 +[default0]:Skipping sample id=581297. Maximum sequence length: 2049, sample length: 2540 +[default0]:Skipping sample id=612052. Maximum sequence length: 2049, sample length: 2241 +[default0]:Skipping sample id=407123. Maximum sequence length: 2049, sample length: 3847 +[default0]:Skipping sample id=692600. Maximum sequence length: 2049, sample length: 2983 +[default0]:Skipping sample id=1558837. Maximum sequence length: 2049, sample length: 2655 +[default0]:Skipping sample id=719523. Maximum sequence length: 2049, sample length: 3711 +[default0]:Skipping sample id=923933. Maximum sequence length: 2049, sample length: 2259 +[default0]:Skipping sample id=640789. Maximum sequence length: 2049, sample length: 3489 +[default0]:Skipping sample id=901516. Maximum sequence length: 2049, sample length: 2128 +[default0]:Skipping sample id=999735. Maximum sequence length: 2049, sample length: 4120 +[default0]:Skipping sample id=1489524. Maximum sequence length: 2049, sample length: 4526 +[default0]:Skipping sample id=994710. Maximum sequence length: 2049, sample length: 2890 +[default0]:Skipping sample id=841519. Maximum sequence length: 2049, sample length: 3501 +[default0]:Skipping sample id=825541. Maximum sequence length: 2049, sample length: 3090 +[default0]:Skipping sample id=555296. Maximum sequence length: 2049, sample length: 2703 +[default0]:Skipping sample id=542440. Maximum sequence length: 2049, sample length: 2379 +[default0]:Skipping sample id=1359585. Maximum sequence length: 2049, sample length: 2095 +[default0]:Skipping sample id=909685. Maximum sequence length: 2049, sample length: 3547 +[default0]:Skipping sample id=123250. Maximum sequence length: 2049, sample length: 3466 +[default0]:Skipping sample id=302032. Maximum sequence length: 2049, sample length: 2199 +[default0]:Skipping sample id=1412378. Maximum sequence length: 2049, sample length: 2644 +[default0]:Skipping sample id=1183302. Maximum sequence length: 2049, sample length: 3672 +[default0]:Skipping sample id=1198873. Maximum sequence length: 2049, sample length: 2119 +[default0]:Skipping sample id=953701. Maximum sequence length: 2049, sample length: 2735 +[default0]:Skipping sample id=747970. Maximum sequence length: 2049, sample length: 2096 +[default0]:Skipping sample id=1506868. Maximum sequence length: 2049, sample length: 4368 +[default0]:Skipping sample id=1291196. Maximum sequence length: 2049, sample length: 2632 +[default0]:Skipping sample id=375275. Maximum sequence length: 2049, sample length: 2474 +[default0]:Skipping sample id=384305. Maximum sequence length: 2049, sample length: 2132 +[default0]:Skipping sample id=477007. Maximum sequence length: 2049, sample length: 2357 +[default0]:Skipping sample id=449386. Maximum sequence length: 2049, sample length: 3268 +[default0]:Skipping sample id=69192. Maximum sequence length: 2049, sample length: 2396 +[default0]:Skipping sample id=47313. Maximum sequence length: 2049, sample length: 5031 +[default0]:Skipping sample id=720376. Maximum sequence length: 2049, sample length: 3304 +[default0]:Skipping sample id=663380. Maximum sequence length: 2049, sample length: 2050 +[default0]:Skipping sample id=1289061. Maximum sequence length: 2049, sample length: 2097 +[default0]:Skipping sample id=853658. Maximum sequence length: 2049, sample length: 2691 +[default0]:Skipping sample id=701195. Maximum sequence length: 2049, sample length: 2580 +[default0]:Skipping sample id=1546054. Maximum sequence length: 2049, sample length: 2683 +[default0]:Skipping sample id=1250550. Maximum sequence length: 2049, sample length: 2059 +[default0]:Skipping sample id=1379978. Maximum sequence length: 2049, sample length: 3168 +[default0]:Skipping sample id=1140875. Maximum sequence length: 2049, sample length: 2882 +[default0]:Skipping sample id=1403901. Maximum sequence length: 2049, sample length: 3876 +[default0]:Skipping sample id=268785. Maximum sequence length: 2049, sample length: 3489 +[default0]:Skipping sample id=1500818. Maximum sequence length: 2049, sample length: 3616 +[default0]:Skipping sample id=393081. Maximum sequence length: 2049, sample length: 2454 +[default0]:Skipping sample id=1409339. Maximum sequence length: 2049, sample length: 4066 +[default0]:Skipping sample id=1001432. Maximum sequence length: 2049, sample length: 2409 +[default0]:Skipping sample id=1455139. Maximum sequence length: 2049, sample length: 2265 +[default0]:Skipping sample id=1358320. Maximum sequence length: 2049, sample length: 2300 +[default0]:Skipping sample id=168288. Maximum sequence length: 2049, sample length: 2927 +[default0]:Skipping sample id=1186888. Maximum sequence length: 2049, sample length: 2647 +[default0]:Skipping sample id=1431956. Maximum sequence length: 2049, sample length: 2115 +[default0]:Skipping sample id=103109. Maximum sequence length: 2049, sample length: 2285 +[default0]:Skipping sample id=23055. Maximum sequence length: 2049, sample length: 2283 +[default0]:Skipping sample id=1332680. Maximum sequence length: 2049, sample length: 2510 +[default0]:Skipping sample id=369384. Maximum sequence length: 2049, sample length: 2479 +[default0]:Skipping sample id=730345. Maximum sequence length: 2049, sample length: 2340 +[default0]:Skipping sample id=923981. Maximum sequence length: 2049, sample length: 2768 +[default0]:Skipping sample id=1445714. Maximum sequence length: 2049, sample length: 2887 +[default0]:Skipping sample id=971915. Maximum sequence length: 2049, sample length: 2280 +[default0]:Skipping sample id=291442. Maximum sequence length: 2049, sample length: 2337 +[default0]:Skipping sample id=1201950. Maximum sequence length: 2049, sample length: 2386 +[default0]:Skipping sample id=61027. Maximum sequence length: 2049, sample length: 2206 +[default0]:Skipping sample id=1090197. Maximum sequence length: 2049, sample length: 2637 +[default0]:Skipping sample id=17065. Maximum sequence length: 2049, sample length: 3557 +[default0]:Skipping sample id=771332. Maximum sequence length: 2049, sample length: 2473 +[default0]:Skipping sample id=431642. Maximum sequence length: 2049, sample length: 3846 +[default0]:Skipping sample id=1163874. Maximum sequence length: 2049, sample length: 2502 +[default0]:Skipping sample id=36658. Maximum sequence length: 2049, sample length: 2301 +[default0]:Skipping sample id=1507779. Maximum sequence length: 2049, sample length: 3414 +[default0]:Skipping sample id=1090226. Maximum sequence length: 2049, sample length: 2734 +[default0]:Skipping sample id=1495795. Maximum sequence length: 2049, sample length: 2723 +[default0]:Skipping sample id=106337. Maximum sequence length: 2049, sample length: 2118 +[default0]:Skipping sample id=316928. Maximum sequence length: 2049, sample length: 3259 +[default0]:Skipping sample id=609987. Maximum sequence length: 2049, sample length: 5364 +[default0]:Skipping sample id=534286. Maximum sequence length: 2049, sample length: 2405 +[default0]:Skipping sample id=799702. Maximum sequence length: 2049, sample length: 3019 +[default0]:Skipping sample id=347080. Maximum sequence length: 2049, sample length: 2488 +[default0]:Skipping sample id=724744. Maximum sequence length: 2049, sample length: 2556 +[default0]:Skipping sample id=1491617. Maximum sequence length: 2049, sample length: 5337 +[default0]:Skipping sample id=925298. Maximum sequence length: 2049, sample length: 2745 +[default0]:Skipping sample id=1518248. Maximum sequence length: 2049, sample length: 5528 +[default0]:Skipping sample id=204582. Maximum sequence length: 2049, sample length: 2294 +[default0]:Skipping sample id=154425. Maximum sequence length: 2049, sample length: 4020 +[default0]:Skipping sample id=1212140. Maximum sequence length: 2049, sample length: 2521 +[default0]:Skipping sample id=130061. Maximum sequence length: 2049, sample length: 2284 +[default0]:Skipping sample id=267641. Maximum sequence length: 2049, sample length: 3001 +[default0]:Skipping sample id=323466. Maximum sequence length: 2049, sample length: 2069 +[default0]:Skipping sample id=1341613. Maximum sequence length: 2049, sample length: 2159 +[default0]:Skipping sample id=170041. Maximum sequence length: 2049, sample length: 3262 +[default0]:Skipping sample id=1115566. Maximum sequence length: 2049, sample length: 4339 +[default0]:Skipping sample id=1534714. Maximum sequence length: 2049, sample length: 2497 +[default0]:Skipping sample id=420392. Maximum sequence length: 2049, sample length: 3410 +[default0]:Skipping sample id=433897. Maximum sequence length: 2049, sample length: 5678 +[default0]:Skipping sample id=1415515. Maximum sequence length: 2049, sample length: 3553 +[default0]:Skipping sample id=556395. Maximum sequence length: 2049, sample length: 2095 +[default0]:Skipping sample id=10953. Maximum sequence length: 2049, sample length: 2325 +[default0]:Skipping sample id=407091. Maximum sequence length: 2049, sample length: 3951 +[default0]:Skipping sample id=1499244. Maximum sequence length: 2049, sample length: 2447 +[default0]:Skipping sample id=1418684. Maximum sequence length: 2049, sample length: 2227 +[default0]:Skipping sample id=191084. Maximum sequence length: 2049, sample length: 2303 +[default0]:Skipping sample id=1024353. Maximum sequence length: 2049, sample length: 2124 +[default0]:Skipping sample id=1031875. Maximum sequence length: 2049, sample length: 2584 +[default0]:Skipping sample id=1555312. Maximum sequence length: 2049, sample length: 2209 +[default0]:Skipping sample id=79509. Maximum sequence length: 2049, sample length: 3021 +[default0]:Skipping sample id=1350103. Maximum sequence length: 2049, sample length: 3097 +[default0]:Skipping sample id=224775. Maximum sequence length: 2049, sample length: 3023 +[default0]:Skipping sample id=882397. Maximum sequence length: 2049, sample length: 2165 +[default0]:Skipping sample id=388780. Maximum sequence length: 2049, sample length: 2153 +[default0]:Skipping sample id=1270327. Maximum sequence length: 2049, sample length: 4155 +[default0]:Skipping sample id=254135. Maximum sequence length: 2049, sample length: 2649 +[default0]:Skipping sample id=1372215. Maximum sequence length: 2049, sample length: 2294 +[default0]:Skipping sample id=182359. Maximum sequence length: 2049, sample length: 5167 +[default0]:Skipping sample id=888989. Maximum sequence length: 2049, sample length: 3206 +[default0]:Skipping sample id=610319. Maximum sequence length: 2049, sample length: 4233 +[default0]:Skipping sample id=1003993. Maximum sequence length: 2049, sample length: 2813 +[default0]:Skipping sample id=751101. Maximum sequence length: 2049, sample length: 2141 +[default0]:Skipping sample id=1329933. Maximum sequence length: 2049, sample length: 3264 +[default0]:Skipping sample id=1207373. Maximum sequence length: 2049, sample length: 3066 +[default0]:Skipping sample id=835082. Maximum sequence length: 2049, sample length: 4325 +[default0]:Skipping sample id=721980. Maximum sequence length: 2049, sample length: 2213 +[default0]:Skipping sample id=413609. Maximum sequence length: 2049, sample length: 4178 +[default0]:Skipping sample id=610512. Maximum sequence length: 2049, sample length: 2766 +[default0]:Skipping sample id=1088518. Maximum sequence length: 2049, sample length: 3471 +[default0]:Skipping sample id=657601. Maximum sequence length: 2049, sample length: 4474 +[default0]:Skipping sample id=1125206. Maximum sequence length: 2049, sample length: 3958 +[default0]:Skipping sample id=256031. Maximum sequence length: 2049, sample length: 3311 +[default0]:Skipping sample id=1297509. Maximum sequence length: 2049, sample length: 2326 +[default0]:Skipping sample id=198249. Maximum sequence length: 2049, sample length: 5161 +[default0]:Skipping sample id=979333. Maximum sequence length: 2049, sample length: 2847 +[default0]:Skipping sample id=1485828. Maximum sequence length: 2049, sample length: 2870 +[default0]:Skipping sample id=1499754. Maximum sequence length: 2049, sample length: 2085 +[default0]:Skipping sample id=1023459. Maximum sequence length: 2049, sample length: 2292 +[default0]:Skipping sample id=952273. Maximum sequence length: 2049, sample length: 2319 +[default0]:Skipping sample id=1392649. Maximum sequence length: 2049, sample length: 2915 +[default0]:Skipping sample id=1116502. Maximum sequence length: 2049, sample length: 3192 +[default0]:Skipping sample id=727744. Maximum sequence length: 2049, sample length: 3262 +[default0]:Skipping sample id=741214. Maximum sequence length: 2049, sample length: 2729 +[default0]:Skipping sample id=1039556. Maximum sequence length: 2049, sample length: 2675 +[default0]:Skipping sample id=216412. Maximum sequence length: 2049, sample length: 2547 +[default0]:Skipping sample id=690348. Maximum sequence length: 2049, sample length: 2186 +[default0]:Skipping sample id=611814. Maximum sequence length: 2049, sample length: 2506 +[default0]:Skipping sample id=1268236. Maximum sequence length: 2049, sample length: 2154 +[default0]:Skipping sample id=385189. Maximum sequence length: 2049, sample length: 2286 +[default0]:Skipping sample id=102408. Maximum sequence length: 2049, sample length: 2760 +[default0]:Skipping sample id=48218. Maximum sequence length: 2049, sample length: 2296 +[default0]:Skipping sample id=1221973. Maximum sequence length: 2049, sample length: 3295 +[default0]:Skipping sample id=1192073. Maximum sequence length: 2049, sample length: 3424 +[default0]:Skipping sample id=549138. Maximum sequence length: 2049, sample length: 2381 +[default0]:Skipping sample id=1432397. Maximum sequence length: 2049, sample length: 2075 +[default0]:Skipping sample id=1146142. Maximum sequence length: 2049, sample length: 2784 +[default0]:Skipping sample id=1300659. Maximum sequence length: 2049, sample length: 2487 +[default0]:Skipping sample id=419767. Maximum sequence length: 2049, sample length: 3582 +[default0]:Skipping sample id=1185634. Maximum sequence length: 2049, sample length: 4892 +[default0]:Skipping sample id=1112462. Maximum sequence length: 2049, sample length: 2189 +[default0]:Skipping sample id=947386. Maximum sequence length: 2049, sample length: 2812 +[default0]:Skipping sample id=530347. Maximum sequence length: 2049, sample length: 2394 +[default0]:Skipping sample id=1435059. Maximum sequence length: 2049, sample length: 3521 +[default0]:Skipping sample id=679802. Maximum sequence length: 2049, sample length: 2516 +[default0]:Skipping sample id=95108. Maximum sequence length: 2049, sample length: 4254 +[default0]:Skipping sample id=621213. Maximum sequence length: 2049, sample length: 2577 +[default0]:Skipping sample id=601014. Maximum sequence length: 2049, sample length: 4176 +[default0]:Skipping sample id=904573. Maximum sequence length: 2049, sample length: 2317 +[default0]:Skipping sample id=42352. Maximum sequence length: 2049, sample length: 4167 +[default0]:Skipping sample id=781057. Maximum sequence length: 2049, sample length: 3797 +[default0]:Skipping sample id=1393529. Maximum sequence length: 2049, sample length: 2448 +[default0]:Skipping sample id=89871. Maximum sequence length: 2049, sample length: 2950 +[default0]:Skipping sample id=195001. Maximum sequence length: 2049, sample length: 2104 +[default0]:Skipping sample id=357414. Maximum sequence length: 2049, sample length: 2135 +[default0]:Skipping sample id=1259289. Maximum sequence length: 2049, sample length: 3021 +[default0]:Skipping sample id=1010411. Maximum sequence length: 2049, sample length: 3995 +[default0]:Skipping sample id=204606. Maximum sequence length: 2049, sample length: 3370 +[default0]:Skipping sample id=1068629. Maximum sequence length: 2049, sample length: 2210 +[default0]:Skipping sample id=951356. Maximum sequence length: 2049, sample length: 3351 +[default0]:Skipping sample id=1039872. Maximum sequence length: 2049, sample length: 2481 +[default0]:Skipping sample id=991468. Maximum sequence length: 2049, sample length: 4548 +[default0]:Skipping sample id=1435214. Maximum sequence length: 2049, sample length: 3784 +[default0]:Skipping sample id=463285. Maximum sequence length: 2049, sample length: 2342 +[default0]:Skipping sample id=174021. Maximum sequence length: 2049, sample length: 2360 +[default0]:Skipping sample id=1130845. Maximum sequence length: 2049, sample length: 3024 +[default0]:Skipping sample id=1243610. Maximum sequence length: 2049, sample length: 3203 +[default0]:Skipping sample id=1216582. Maximum sequence length: 2049, sample length: 2050 +[default0]:Skipping sample id=865894. Maximum sequence length: 2049, sample length: 3801 +[default0]:Skipping sample id=198692. Maximum sequence length: 2049, sample length: 2161 +[default0]:Skipping sample id=117294. Maximum sequence length: 2049, sample length: 2173 +[default0]:Skipping sample id=1349403. Maximum sequence length: 2049, sample length: 2146 +[default0]:Skipping sample id=392072. Maximum sequence length: 2049, sample length: 2430 +[default0]:Skipping sample id=1547910. Maximum sequence length: 2049, sample length: 3570 +[default0]:Skipping sample id=1345484. Maximum sequence length: 2049, sample length: 2089 +[default0]:Skipping sample id=181739. Maximum sequence length: 2049, sample length: 2361 +[default0]:Skipping sample id=678405. Maximum sequence length: 2049, sample length: 3277 +[default0]:Skipping sample id=468453. Maximum sequence length: 2049, sample length: 2284 +[default0]:Skipping sample id=37298. Maximum sequence length: 2049, sample length: 2918 +[default0]:Skipping sample id=879570. Maximum sequence length: 2049, sample length: 2436 +[default0]:Skipping sample id=526518. Maximum sequence length: 2049, sample length: 2090 +[default0]:Skipping sample id=251660. Maximum sequence length: 2049, sample length: 2067 +[default0]:Skipping sample id=960744. Maximum sequence length: 2049, sample length: 2782 +[default0]:Skipping sample id=1318112. Maximum sequence length: 2049, sample length: 2369 +[default0]:Skipping sample id=773062. Maximum sequence length: 2049, sample length: 2300 +[default0]:Skipping sample id=767815. Maximum sequence length: 2049, sample length: 2232 +[default0]:Skipping sample id=975933. Maximum sequence length: 2049, sample length: 3327 +[default0]:Skipping sample id=170099. Maximum sequence length: 2049, sample length: 2440 +[default0]:Skipping sample id=740574. Maximum sequence length: 2049, sample length: 2784 +[default0]:Skipping sample id=500206. Maximum sequence length: 2049, sample length: 2564 +[default0]:Skipping sample id=459733. Maximum sequence length: 2049, sample length: 2495 +[default0]:Skipping sample id=270188. Maximum sequence length: 2049, sample length: 2164 +[default0]:Skipping sample id=226600. Maximum sequence length: 2049, sample length: 3592 +[default0]:Skipping sample id=868394. Maximum sequence length: 2049, sample length: 2488 +[default0]:Skipping sample id=1162633. Maximum sequence length: 2049, sample length: 2090 +[default0]:Skipping sample id=1031333. Maximum sequence length: 2049, sample length: 2609 +[default0]:Skipping sample id=30229. Maximum sequence length: 2049, sample length: 2373 +[default0]:Skipping sample id=779147. Maximum sequence length: 2049, sample length: 2627 +[default0]:Skipping sample id=230231. Maximum sequence length: 2049, sample length: 3173 +[default0]:Skipping sample id=571960. Maximum sequence length: 2049, sample length: 4118 +[default0]:Skipping sample id=1158800. Maximum sequence length: 2049, sample length: 3328 +[default0]:Skipping sample id=63849. Maximum sequence length: 2049, sample length: 2075 +[default0]:Skipping sample id=1444664. Maximum sequence length: 2049, sample length: 2749 +[default0]:Skipping sample id=7990. Maximum sequence length: 2049, sample length: 3399 +[default0]:Skipping sample id=67235. Maximum sequence length: 2049, sample length: 2140 +[default0]:Skipping sample id=414896. Maximum sequence length: 2049, sample length: 4754 +[default0]:Skipping sample id=1275857. Maximum sequence length: 2049, sample length: 2318 +[default0]:Skipping sample id=450581. Maximum sequence length: 2049, sample length: 2673 +[default0]:Skipping sample id=262426. Maximum sequence length: 2049, sample length: 2225 +[default0]:Skipping sample id=1165094. Maximum sequence length: 2049, sample length: 3773 +[default0]:Skipping sample id=634190. Maximum sequence length: 2049, sample length: 2170 +[default0]:Skipping sample id=624411. Maximum sequence length: 2049, sample length: 3262 +[default0]:Skipping sample id=68724. Maximum sequence length: 2049, sample length: 2989 +[default0]:Skipping sample id=559570. Maximum sequence length: 2049, sample length: 2924 +[default0]:Skipping sample id=202604. Maximum sequence length: 2049, sample length: 2805 +[default0]:Skipping sample id=1389699. Maximum sequence length: 2049, sample length: 3124 +[default0]:Skipping sample id=1550055. Maximum sequence length: 2049, sample length: 4284 +[default0]:Skipping sample id=1339383. Maximum sequence length: 2049, sample length: 5453 +[default0]:Skipping sample id=1381065. Maximum sequence length: 2049, sample length: 6010 +[default0]:Skipping sample id=1138194. Maximum sequence length: 2049, sample length: 3531 +[default0]:Skipping sample id=752355. Maximum sequence length: 2049, sample length: 3784 +[default0]:Skipping sample id=243016. Maximum sequence length: 2049, sample length: 4168 +[default0]:Skipping sample id=330483. Maximum sequence length: 2049, sample length: 2183 +[default0]:Skipping sample id=127032. Maximum sequence length: 2049, sample length: 2411 +[default0]:Skipping sample id=718680. Maximum sequence length: 2049, sample length: 3253 +[default0]:Skipping sample id=943377. Maximum sequence length: 2049, sample length: 2985 +[default0]:Skipping sample id=310959. Maximum sequence length: 2049, sample length: 3966 +[default0]:Skipping sample id=965248. Maximum sequence length: 2049, sample length: 2335 +[default0]:Skipping sample id=1077852. Maximum sequence length: 2049, sample length: 2285 +[default0]:Skipping sample id=1432578. Maximum sequence length: 2049, sample length: 3815 +[default0]:Skipping sample id=414178. Maximum sequence length: 2049, sample length: 3687 +[default0]:Skipping sample id=1211517. Maximum sequence length: 2049, sample length: 2465 +[default0]:Skipping sample id=585644. Maximum sequence length: 2049, sample length: 2706 +[default0]:Skipping sample id=477085. Maximum sequence length: 2049, sample length: 2627 +[default0]:Skipping sample id=624493. Maximum sequence length: 2049, sample length: 2322 +[default0]:Skipping sample id=618513. Maximum sequence length: 2049, sample length: 2071 +[default0]:Skipping sample id=369919. Maximum sequence length: 2049, sample length: 3312 +[default0]:Skipping sample id=994616. Maximum sequence length: 2049, sample length: 2836 +[default0]:Skipping sample id=254402. Maximum sequence length: 2049, sample length: 2142 +[default0]:Skipping sample id=1039296. Maximum sequence length: 2049, sample length: 3468 +[default0]:Skipping sample id=931606. Maximum sequence length: 2049, sample length: 2092 +[default0]:Skipping sample id=580503. Maximum sequence length: 2049, sample length: 2528 +[default0]:Skipping sample id=26982. Maximum sequence length: 2049, sample length: 2099 +[default0]:Skipping sample id=1558145. Maximum sequence length: 2049, sample length: 2173 +[default0]:Skipping sample id=829188. Maximum sequence length: 2049, sample length: 2377 +[default0]:Skipping sample id=1562772. Maximum sequence length: 2049, sample length: 3523 +[default0]:Skipping sample id=1261313. Maximum sequence length: 2049, sample length: 5398 +[default0]:Skipping sample id=481481. Maximum sequence length: 2049, sample length: 2475 +[default0]:Skipping sample id=233766. Maximum sequence length: 2049, sample length: 2413 +[default0]:Skipping sample id=1384505. Maximum sequence length: 2049, sample length: 2280 +[default0]:Skipping sample id=481344. Maximum sequence length: 2049, sample length: 3220 +[default0]:Skipping sample id=960694. Maximum sequence length: 2049, sample length: 2139 +[default0]:Skipping sample id=1154754. Maximum sequence length: 2049, sample length: 2091 +[default0]:Skipping sample id=1278990. Maximum sequence length: 2049, sample length: 3084 +[default0]:Skipping sample id=1215536. Maximum sequence length: 2049, sample length: 3772 +[default0]:Skipping sample id=717776. Maximum sequence length: 2049, sample length: 2103 +[default0]:Skipping sample id=595023. Maximum sequence length: 2049, sample length: 3528 +[default0]:Skipping sample id=1347607. Maximum sequence length: 2049, sample length: 3264 +[default0]:Skipping sample id=911969. Maximum sequence length: 2049, sample length: 2580 +[default0]:Skipping sample id=1476502. Maximum sequence length: 2049, sample length: 2316 +[default0]:Skipping sample id=272312. Maximum sequence length: 2049, sample length: 2092 +[default0]:Skipping sample id=714774. Maximum sequence length: 2049, sample length: 5117 +[default0]:Skipping sample id=726447. Maximum sequence length: 2049, sample length: 4128 +[default0]:Skipping sample id=1079916. Maximum sequence length: 2049, sample length: 3988 +[default0]:Skipping sample id=1387878. Maximum sequence length: 2049, sample length: 3226 +[default0]:Skipping sample id=1329918. Maximum sequence length: 2049, sample length: 3697 +[default0]:Skipping sample id=1053131. Maximum sequence length: 2049, sample length: 2239 +[default0]:Skipping sample id=177549. Maximum sequence length: 2049, sample length: 2735 +[default0]:Skipping sample id=1344311. Maximum sequence length: 2049, sample length: 2662 +[default0]:Skipping sample id=719505. Maximum sequence length: 2049, sample length: 2233 +[default0]:Skipping sample id=1040440. Maximum sequence length: 2049, sample length: 2081 +[default0]:Skipping sample id=220755. Maximum sequence length: 2049, sample length: 2649 +[default0]:Skipping sample id=263772. Maximum sequence length: 2049, sample length: 2232 +[default0]:Skipping sample id=1561867. Maximum sequence length: 2049, sample length: 2114 +[default0]:Skipping sample id=1486204. Maximum sequence length: 2049, sample length: 2958 +[default0]:Skipping sample id=1475603. Maximum sequence length: 2049, sample length: 2125 +[default0]:Skipping sample id=615429. Maximum sequence length: 2049, sample length: 2155 +[default0]:Skipping sample id=462511. Maximum sequence length: 2049, sample length: 2613 +[default0]:Skipping sample id=1112506. Maximum sequence length: 2049, sample length: 2491 +[default0]:Skipping sample id=218191. Maximum sequence length: 2049, sample length: 2390 +[default0]:Skipping sample id=206934. Maximum sequence length: 2049, sample length: 2121 +[default0]:Skipping sample id=104560. Maximum sequence length: 2049, sample length: 2586 +[default0]:Skipping sample id=856604. Maximum sequence length: 2049, sample length: 2795 +[default0]:Skipping sample id=1421537. Maximum sequence length: 2049, sample length: 3870 +[default0]:Skipping sample id=1552812. Maximum sequence length: 2049, sample length: 2170 +[default0]:Skipping sample id=1344707. Maximum sequence length: 2049, sample length: 2135 +[default0]:Skipping sample id=1571071. Maximum sequence length: 2049, sample length: 2229 +[default0]:Skipping sample id=523151. Maximum sequence length: 2049, sample length: 3920 +[default0]:Skipping sample id=1297844. Maximum sequence length: 2049, sample length: 2852 +[default0]:Skipping sample id=57341. Maximum sequence length: 2049, sample length: 3005 +[default0]:Skipping sample id=762848. Maximum sequence length: 2049, sample length: 2521 +[default0]:Skipping sample id=1190133. Maximum sequence length: 2049, sample length: 2290 +[default0]:Skipping sample id=1300171. Maximum sequence length: 2049, sample length: 5659 +[default0]:Skipping sample id=1042460. Maximum sequence length: 2049, sample length: 2243 +[default0]:Skipping sample id=27524. Maximum sequence length: 2049, sample length: 2671 +[default0]:Skipping sample id=967712. Maximum sequence length: 2049, sample length: 2337 +[default0]:Skipping sample id=444237. Maximum sequence length: 2049, sample length: 2481 +[default0]:Skipping sample id=978032. Maximum sequence length: 2049, sample length: 4563 +[default0]:Skipping sample id=563905. Maximum sequence length: 2049, sample length: 4841 +[default0]:Skipping sample id=1297765. Maximum sequence length: 2049, sample length: 3538 +[default0]:Skipping sample id=711197. Maximum sequence length: 2049, sample length: 2618 +[default0]:Skipping sample id=162794. Maximum sequence length: 2049, sample length: 5987 +[default0]:Skipping sample id=458739. Maximum sequence length: 2049, sample length: 3955 +[default0]:Skipping sample id=553995. Maximum sequence length: 2049, sample length: 2875 +[default0]:Skipping sample id=1214559. Maximum sequence length: 2049, sample length: 2776 +[default0]:Skipping sample id=279706. Maximum sequence length: 2049, sample length: 2131 +[default0]:Skipping sample id=476573. Maximum sequence length: 2049, sample length: 2154 +[default0]:Skipping sample id=1470833. Maximum sequence length: 2049, sample length: 2275 +[default0]:Skipping sample id=645053. Maximum sequence length: 2049, sample length: 2585 +[default0]:Skipping sample id=244429. Maximum sequence length: 2049, sample length: 3485 +[default0]:Skipping sample id=277661. Maximum sequence length: 2049, sample length: 3628 +[default0]:Skipping sample id=1175020. Maximum sequence length: 2049, sample length: 5454 +[default0]:Skipping sample id=1322071. Maximum sequence length: 2049, sample length: 3235 +[default0]:Skipping sample id=176250. Maximum sequence length: 2049, sample length: 2175 +[default0]:Skipping sample id=288450. Maximum sequence length: 2049, sample length: 2389 +[default0]:Skipping sample id=1530250. Maximum sequence length: 2049, sample length: 4754 +[default0]:Skipping sample id=20684. Maximum sequence length: 2049, sample length: 2527 +[default0]:Skipping sample id=688169. Maximum sequence length: 2049, sample length: 2355 +[default0]:Skipping sample id=311519. Maximum sequence length: 2049, sample length: 2386 +[default0]:Skipping sample id=1556127. Maximum sequence length: 2049, sample length: 2308 +[default0]:Skipping sample id=248539. Maximum sequence length: 2049, sample length: 2220 +[default0]:Skipping sample id=1408508. Maximum sequence length: 2049, sample length: 2509 +[default0]:Skipping sample id=1338226. Maximum sequence length: 2049, sample length: 2479 +[default0]:Skipping sample id=542821. Maximum sequence length: 2049, sample length: 3443 +[default0]:Skipping sample id=1517483. Maximum sequence length: 2049, sample length: 2348 +[default0]:Skipping sample id=548748. Maximum sequence length: 2049, sample length: 2265 +[default0]:Skipping sample id=679712. Maximum sequence length: 2049, sample length: 2729 +[default0]:Skipping sample id=1498425. Maximum sequence length: 2049, sample length: 5238 +[default0]:Skipping sample id=1294689. Maximum sequence length: 2049, sample length: 2079 +[default0]:Skipping sample id=1031992. Maximum sequence length: 2049, sample length: 2609 +[default0]:Skipping sample id=528213. Maximum sequence length: 2049, sample length: 2524 +[default0]:Skipping sample id=626583. Maximum sequence length: 2049, sample length: 3675 +[default0]:Skipping sample id=1055851. Maximum sequence length: 2049, sample length: 2185 +[default0]:Skipping sample id=846514. Maximum sequence length: 2049, sample length: 2586 +[default0]:Skipping sample id=1389829. Maximum sequence length: 2049, sample length: 2309 +[default0]:Skipping sample id=699479. Maximum sequence length: 2049, sample length: 2523 +[default0]:Skipping sample id=1230248. Maximum sequence length: 2049, sample length: 2308 +[default0]:Skipping sample id=1120259. Maximum sequence length: 2049, sample length: 3721 +[default0]:Skipping sample id=771683. Maximum sequence length: 2049, sample length: 3326 +[default0]:Skipping sample id=1033816. Maximum sequence length: 2049, sample length: 2076 +[default0]:Skipping sample id=1240903. Maximum sequence length: 2049, sample length: 3667 +[default0]:Skipping sample id=1127720. Maximum sequence length: 2049, sample length: 2814 +[default0]:Skipping sample id=890963. Maximum sequence length: 2049, sample length: 3043 +[default0]:Skipping sample id=72424. Maximum sequence length: 2049, sample length: 3706 +[default0]:Skipping sample id=1149589. Maximum sequence length: 2049, sample length: 2378 +[default0]:Skipping sample id=855455. Maximum sequence length: 2049, sample length: 3850 +[default0]:Skipping sample id=1383965. Maximum sequence length: 2049, sample length: 2072 +[default0]:Skipping sample id=145533. Maximum sequence length: 2049, sample length: 4685 +[default0]:Skipping sample id=1339385. Maximum sequence length: 2049, sample length: 4870 +[default0]:Skipping sample id=737172. Maximum sequence length: 2049, sample length: 3365 +[default0]:Skipping sample id=940695. Maximum sequence length: 2049, sample length: 2163 +[default0]:Skipping sample id=1456258. Maximum sequence length: 2049, sample length: 3462 +[default0]:Skipping sample id=197397. Maximum sequence length: 2049, sample length: 2923 +[default0]:Skipping sample id=1208450. Maximum sequence length: 2049, sample length: 2140 +[default0]:Skipping sample id=840517. Maximum sequence length: 2049, sample length: 2245 +[default0]:Skipping sample id=1117973. Maximum sequence length: 2049, sample length: 2346 +[default0]:Skipping sample id=349287. Maximum sequence length: 2049, sample length: 6318 +[default0]:Skipping sample id=244464. Maximum sequence length: 2049, sample length: 2648 +[default0]:Skipping sample id=800682. Maximum sequence length: 2049, sample length: 2238 +[default0]:Skipping sample id=714888. Maximum sequence length: 2049, sample length: 3298 +[default0]:Skipping sample id=1348659. Maximum sequence length: 2049, sample length: 2539 +[default0]:Skipping sample id=1359813. Maximum sequence length: 2049, sample length: 4136 +[default0]:Skipping sample id=1252983. Maximum sequence length: 2049, sample length: 2179 +[default0]:Skipping sample id=443792. Maximum sequence length: 2049, sample length: 2318 +[default0]:Skipping sample id=750204. Maximum sequence length: 2049, sample length: 8992 +[default0]:Skipping sample id=814447. Maximum sequence length: 2049, sample length: 2795 +[default0]:Skipping sample id=1365871. Maximum sequence length: 2049, sample length: 2776 +[default0]:Skipping sample id=955541. Maximum sequence length: 2049, sample length: 3278 +[default0]:Skipping sample id=1047044. Maximum sequence length: 2049, sample length: 2822 +[default0]:Skipping sample id=1153686. Maximum sequence length: 2049, sample length: 2273 +[default0]:Skipping sample id=114733. Maximum sequence length: 2049, sample length: 2090 +[default0]:Skipping sample id=129927. Maximum sequence length: 2049, sample length: 3669 +[default0]:Skipping sample id=1494825. Maximum sequence length: 2049, sample length: 3534 +[default0]:Skipping sample id=593114. Maximum sequence length: 2049, sample length: 2604 +[default0]:Skipping sample id=596706. Maximum sequence length: 2049, sample length: 5664 +[default0]:Skipping sample id=1142813. Maximum sequence length: 2049, sample length: 3684 +[default0]:Skipping sample id=1507232. Maximum sequence length: 2049, sample length: 2054 +[default0]:Skipping sample id=1454064. Maximum sequence length: 2049, sample length: 5141 +[default0]:Skipping sample id=697238. Maximum sequence length: 2049, sample length: 2366 +[default0]:Skipping sample id=561609. Maximum sequence length: 2049, sample length: 2121 +[default0]:Skipping sample id=1152951. Maximum sequence length: 2049, sample length: 2160 +[default0]:Skipping sample id=459905. Maximum sequence length: 2049, sample length: 2591 +[default0]:Skipping sample id=571996. Maximum sequence length: 2049, sample length: 2633 +[default0]:Skipping sample id=259940. Maximum sequence length: 2049, sample length: 3768 +[default0]:Skipping sample id=498508. Maximum sequence length: 2049, sample length: 3046 +[default0]:Skipping sample id=85417. Maximum sequence length: 2049, sample length: 2373 +[default0]:Skipping sample id=161853. Maximum sequence length: 2049, sample length: 3128 +[default0]:Skipping sample id=1374653. Maximum sequence length: 2049, sample length: 2690 +[default0]:Skipping sample id=1251854. Maximum sequence length: 2049, sample length: 2574 +[default0]:Skipping sample id=373196. Maximum sequence length: 2049, sample length: 2735 +[default0]:Skipping sample id=669285. Maximum sequence length: 2049, sample length: 2347 +[default0]:Skipping sample id=383690. Maximum sequence length: 2049, sample length: 2147 +[default0]:Skipping sample id=544268. Maximum sequence length: 2049, sample length: 7248 +[default0]:Skipping sample id=237994. Maximum sequence length: 2049, sample length: 5349 +[default0]:Skipping sample id=1439745. Maximum sequence length: 2049, sample length: 2057 +[default0]:Skipping sample id=1516873. Maximum sequence length: 2049, sample length: 2792 +[default0]:Skipping sample id=301321. Maximum sequence length: 2049, sample length: 2173 +[default0]:Skipping sample id=901894. Maximum sequence length: 2049, sample length: 3301 +[default0]:Skipping sample id=959275. Maximum sequence length: 2049, sample length: 4217 +[default0]:Skipping sample id=1499117. Maximum sequence length: 2049, sample length: 3226 +[default0]:Skipping sample id=1556423. Maximum sequence length: 2049, sample length: 2052 +[default0]:Skipping sample id=477296. Maximum sequence length: 2049, sample length: 2211 +[default0]:Skipping sample id=201243. Maximum sequence length: 2049, sample length: 2644 +[default0]:Skipping sample id=1156065. Maximum sequence length: 2049, sample length: 3122 +[default0]:Skipping sample id=289769. Maximum sequence length: 2049, sample length: 2948 +[default0]:Skipping sample id=303501. Maximum sequence length: 2049, sample length: 2466 +[default0]:Skipping sample id=460357. Maximum sequence length: 2049, sample length: 2193 +[default0]:Skipping sample id=51632. Maximum sequence length: 2049, sample length: 2106 +[default0]:Skipping sample id=1023832. Maximum sequence length: 2049, sample length: 2381 +[default0]:Skipping sample id=171373. Maximum sequence length: 2049, sample length: 2418 +[default0]:Skipping sample id=949451. Maximum sequence length: 2049, sample length: 2196 +[default0]:Skipping sample id=846951. Maximum sequence length: 2049, sample length: 4421 +[default0]:Skipping sample id=1258757. Maximum sequence length: 2049, sample length: 4681 +[default0]:Skipping sample id=384902. Maximum sequence length: 2049, sample length: 4343 +[default0]:Skipping sample id=838486. Maximum sequence length: 2049, sample length: 3235 +[default0]:Skipping sample id=827189. Maximum sequence length: 2049, sample length: 2143 +[default0]:Skipping sample id=251330. Maximum sequence length: 2049, sample length: 2316 +[default0]:Skipping sample id=1270848. Maximum sequence length: 2049, sample length: 2510 +[default0]:Skipping sample id=325125. Maximum sequence length: 2049, sample length: 2602 +[default0]:Skipping sample id=726568. Maximum sequence length: 2049, sample length: 2657 +[default0]:Skipping sample id=11084. Maximum sequence length: 2049, sample length: 3136 +[default0]:Skipping sample id=264420. Maximum sequence length: 2049, sample length: 3045 +[default0]:Skipping sample id=862677. Maximum sequence length: 2049, sample length: 2337 +[default0]:Skipping sample id=165543. Maximum sequence length: 2049, sample length: 4576 +[default0]:Skipping sample id=859015. Maximum sequence length: 2049, sample length: 2149 +[default0]:Skipping sample id=620074. Maximum sequence length: 2049, sample length: 5191 +[default0]:Skipping sample id=283871. Maximum sequence length: 2049, sample length: 3177 +[default0]:Skipping sample id=309059. Maximum sequence length: 2049, sample length: 2843 +[default0]:Skipping sample id=412621. Maximum sequence length: 2049, sample length: 3297 +[default0]:Skipping sample id=1032306. Maximum sequence length: 2049, sample length: 2643 +[default0]:Skipping sample id=139664. Maximum sequence length: 2049, sample length: 2388 +[default0]:Skipping sample id=852585. Maximum sequence length: 2049, sample length: 2378 +[default0]:Skipping sample id=916868. Maximum sequence length: 2049, sample length: 3436 +[default0]:Skipping sample id=1534489. Maximum sequence length: 2049, sample length: 2331 +[default0]:Skipping sample id=1187872. Maximum sequence length: 2049, sample length: 2165 +[default0]:Skipping sample id=699545. Maximum sequence length: 2049, sample length: 6160 +[default0]:Skipping sample id=571110. Maximum sequence length: 2049, sample length: 2927 +[default0]:Skipping sample id=648219. Maximum sequence length: 2049, sample length: 3200 +[default0]:Skipping sample id=309593. Maximum sequence length: 2049, sample length: 2513 +[default0]:Skipping sample id=1093476. Maximum sequence length: 2049, sample length: 2374 +[default0]:Skipping sample id=1025731. Maximum sequence length: 2049, sample length: 2462 +[default0]:Skipping sample id=547431. Maximum sequence length: 2049, sample length: 2163 +[default0]:Skipping sample id=284673. Maximum sequence length: 2049, sample length: 2352 +[default0]:Skipping sample id=1002841. Maximum sequence length: 2049, sample length: 2314 +[default0]:Skipping sample id=61083. Maximum sequence length: 2049, sample length: 3146 +[default0]:Skipping sample id=1012004. Maximum sequence length: 2049, sample length: 2154 +[default0]:Skipping sample id=948574. Maximum sequence length: 2049, sample length: 2768 +[default0]:Skipping sample id=1300566. Maximum sequence length: 2049, sample length: 3364 +[default0]:Skipping sample id=1014868. Maximum sequence length: 2049, sample length: 2560 +[default0]:Skipping sample id=1520332. Maximum sequence length: 2049, sample length: 2075 +[default0]:Skipping sample id=1414380. Maximum sequence length: 2049, sample length: 2531 +[default0]:Skipping sample id=29106. Maximum sequence length: 2049, sample length: 2144 +[default0]:Skipping sample id=1063631. Maximum sequence length: 2049, sample length: 3853 +[default0]:Skipping sample id=1358451. Maximum sequence length: 2049, sample length: 2189 +[default0]:Skipping sample id=119231. Maximum sequence length: 2049, sample length: 2648 +[default0]:Skipping sample id=459553. Maximum sequence length: 2049, sample length: 5130 +[default0]:Skipping sample id=1515627. Maximum sequence length: 2049, sample length: 2341 +[default0]:Skipping sample id=975325. Maximum sequence length: 2049, sample length: 2105 +[default0]:Skipping sample id=516507. Maximum sequence length: 2049, sample length: 2389 +[default0]:Skipping sample id=1421703. Maximum sequence length: 2049, sample length: 5508 +[default0]:Skipping sample id=1325435. Maximum sequence length: 2049, sample length: 2076 +[default0]:Skipping sample id=19359. Maximum sequence length: 2049, sample length: 3161 +[default0]:Skipping sample id=996965. Maximum sequence length: 2049, sample length: 2593 +[default0]:Skipping sample id=1227389. Maximum sequence length: 2049, sample length: 3446 +[default0]:Skipping sample id=678569. Maximum sequence length: 2049, sample length: 3440 +[default0]:Skipping sample id=155145. Maximum sequence length: 2049, sample length: 5473 +[default0]:Skipping sample id=659926. Maximum sequence length: 2049, sample length: 2155 +[default0]:Skipping sample id=1441594. Maximum sequence length: 2049, sample length: 3694 +[default0]:Skipping sample id=249801. Maximum sequence length: 2049, sample length: 2134 +[default0]:Skipping sample id=995959. Maximum sequence length: 2049, sample length: 3759 +[default0]:Skipping sample id=448867. Maximum sequence length: 2049, sample length: 2603 +[default0]:Skipping sample id=1239915. Maximum sequence length: 2049, sample length: 4508 +[default0]:Skipping sample id=1056177. Maximum sequence length: 2049, sample length: 2808 +[default0]:Skipping sample id=512128. Maximum sequence length: 2049, sample length: 2915 +[default0]:Skipping sample id=1325012. Maximum sequence length: 2049, sample length: 3256 +[default0]:Skipping sample id=383492. Maximum sequence length: 2049, sample length: 3536 +[default0]:Skipping sample id=448629. Maximum sequence length: 2049, sample length: 5425 +[default0]:Skipping sample id=178694. Maximum sequence length: 2049, sample length: 4116 +[default0]:Skipping sample id=22311. Maximum sequence length: 2049, sample length: 2517 +[default0]:Skipping sample id=1466200. Maximum sequence length: 2049, sample length: 2975 +[default0]:Skipping sample id=780760. Maximum sequence length: 2049, sample length: 4003 +[default0]:Skipping sample id=1328352. Maximum sequence length: 2049, sample length: 2233 +[default0]:Skipping sample id=535701. Maximum sequence length: 2049, sample length: 3003 +[default0]:Skipping sample id=906069. Maximum sequence length: 2049, sample length: 2512 +[default0]:Skipping sample id=636722. Maximum sequence length: 2049, sample length: 2347 +[default0]:Skipping sample id=1158359. Maximum sequence length: 2049, sample length: 2119 +[default0]:Skipping sample id=1296900. Maximum sequence length: 2049, sample length: 2312 +[default0]:Skipping sample id=183679. Maximum sequence length: 2049, sample length: 4320 +[default0]:Skipping sample id=451458. Maximum sequence length: 2049, sample length: 2314 +[default0]:Skipping sample id=3180. Maximum sequence length: 2049, sample length: 2573 +[default0]:Skipping sample id=640060. Maximum sequence length: 2049, sample length: 4033 +[default0]:Skipping sample id=691153. Maximum sequence length: 2049, sample length: 2632 +[default0]:Skipping sample id=594703. Maximum sequence length: 2049, sample length: 2325 +[default0]:Skipping sample id=1121207. Maximum sequence length: 2049, sample length: 2458 +[default0]:Skipping sample id=1054580. Maximum sequence length: 2049, sample length: 2478 +[default0]:Skipping sample id=1218800. Maximum sequence length: 2049, sample length: 3885 +[default0]:Skipping sample id=625787. Maximum sequence length: 2049, sample length: 2170 +[default0]:Skipping sample id=355190. Maximum sequence length: 2049, sample length: 4586 +[default0]:Skipping sample id=1216140. Maximum sequence length: 2049, sample length: 4273 +[default0]:Skipping sample id=239391. Maximum sequence length: 2049, sample length: 2893 +[default0]:Skipping sample id=329061. Maximum sequence length: 2049, sample length: 2074 +[default0]:Skipping sample id=704573. Maximum sequence length: 2049, sample length: 3266 +[default0]:Skipping sample id=1401786. Maximum sequence length: 2049, sample length: 2957 +[default0]:Skipping sample id=1281620. Maximum sequence length: 2049, sample length: 2843 +[default0]:Skipping sample id=424944. Maximum sequence length: 2049, sample length: 3596 +[default0]:Skipping sample id=676590. Maximum sequence length: 2049, sample length: 2571 +[default0]:Skipping sample id=209262. Maximum sequence length: 2049, sample length: 4126 +[default0]:Skipping sample id=482938. Maximum sequence length: 2049, sample length: 3815 +[default0]:Skipping sample id=8415. Maximum sequence length: 2049, sample length: 2362 +[default0]:Skipping sample id=1400598. Maximum sequence length: 2049, sample length: 2061 +[default0]:Skipping sample id=524086. Maximum sequence length: 2049, sample length: 2512 +[default0]:Skipping sample id=893017. Maximum sequence length: 2049, sample length: 2823 +[default0]:Skipping sample id=1467201. Maximum sequence length: 2049, sample length: 2761 +[default0]:Skipping sample id=323768. Maximum sequence length: 2049, sample length: 2621 +[default0]:Skipping sample id=1197926. Maximum sequence length: 2049, sample length: 3325 +[default0]:Skipping sample id=448454. Maximum sequence length: 2049, sample length: 4769 +[default0]:Skipping sample id=996146. Maximum sequence length: 2049, sample length: 8043 +[default0]:Skipping sample id=372790. Maximum sequence length: 2049, sample length: 2421 +[default0]:Skipping sample id=463037. Maximum sequence length: 2049, sample length: 3350 +[default0]:Skipping sample id=1496242. Maximum sequence length: 2049, sample length: 2411 +[default0]:Skipping sample id=784771. Maximum sequence length: 2049, sample length: 3618 +[default0]:Skipping sample id=710417. Maximum sequence length: 2049, sample length: 2227 +[default0]:Skipping sample id=889085. Maximum sequence length: 2049, sample length: 2199 +[default0]:Skipping sample id=219873. Maximum sequence length: 2049, sample length: 2714 +[default0]:Skipping sample id=1425906. Maximum sequence length: 2049, sample length: 2239 +[default0]:Skipping sample id=1013612. Maximum sequence length: 2049, sample length: 3160 +[default0]:Skipping sample id=1150450. Maximum sequence length: 2049, sample length: 2955 +[default0]:Skipping sample id=926349. Maximum sequence length: 2049, sample length: 2076 +[default0]:Skipping sample id=162249. Maximum sequence length: 2049, sample length: 6492 +[default0]:Skipping sample id=1308791. Maximum sequence length: 2049, sample length: 4303 +[default0]:Skipping sample id=445755. Maximum sequence length: 2049, sample length: 2829 +[default0]:Skipping sample id=1147040. Maximum sequence length: 2049, sample length: 2564 +[default0]:Skipping sample id=985528. Maximum sequence length: 2049, sample length: 2285 +[default0]:Skipping sample id=7271. Maximum sequence length: 2049, sample length: 2317 +[default0]:Skipping sample id=841022. Maximum sequence length: 2049, sample length: 3285 +[default0]:Skipping sample id=19025. Maximum sequence length: 2049, sample length: 3490 +[default0]:Skipping sample id=871169. Maximum sequence length: 2049, sample length: 2136 +[default0]:Skipping sample id=623675. Maximum sequence length: 2049, sample length: 2506 +[default0]:Skipping sample id=730149. Maximum sequence length: 2049, sample length: 2450 +[default0]:Skipping sample id=1556655. Maximum sequence length: 2049, sample length: 2495 +[default0]:Skipping sample id=1525168. Maximum sequence length: 2049, sample length: 2767 +[default0]:Skipping sample id=1337184. Maximum sequence length: 2049, sample length: 3412 +[default0]:Skipping sample id=1243723. Maximum sequence length: 2049, sample length: 2296 +[default0]:Skipping sample id=1510319. Maximum sequence length: 2049, sample length: 3603 +[default0]:Skipping sample id=734207. Maximum sequence length: 2049, sample length: 2577 +[default0]:Skipping sample id=1477601. Maximum sequence length: 2049, sample length: 3657 +[default0]:Skipping sample id=1280556. Maximum sequence length: 2049, sample length: 2939 +[default0]:Skipping sample id=743895. Maximum sequence length: 2049, sample length: 2830 +[default0]:Skipping sample id=1148635. Maximum sequence length: 2049, sample length: 2575 +[default0]:Skipping sample id=1531320. Maximum sequence length: 2049, sample length: 2669 +[default0]:Skipping sample id=485758. Maximum sequence length: 2049, sample length: 3792 +[default0]:Skipping sample id=723459. Maximum sequence length: 2049, sample length: 2435 +[default0]:Skipping sample id=647485. Maximum sequence length: 2049, sample length: 3029 +[default0]:Skipping sample id=1331506. Maximum sequence length: 2049, sample length: 3360 +[default0]:Skipping sample id=837967. Maximum sequence length: 2049, sample length: 2395 +[default0]:Skipping sample id=592366. Maximum sequence length: 2049, sample length: 2129 +[default0]:Skipping sample id=877667. Maximum sequence length: 2049, sample length: 2058 +[default0]:Skipping sample id=225363. Maximum sequence length: 2049, sample length: 2641 +[default0]:Skipping sample id=240797. Maximum sequence length: 2049, sample length: 2274 +[default0]:Skipping sample id=745361. Maximum sequence length: 2049, sample length: 2362 +[default0]:Skipping sample id=1408151. Maximum sequence length: 2049, sample length: 2102 +[default0]:Skipping sample id=1380843. Maximum sequence length: 2049, sample length: 2185 +[default0]:Skipping sample id=1186412. Maximum sequence length: 2049, sample length: 3306 +[default0]:Skipping sample id=66657. Maximum sequence length: 2049, sample length: 3743 +[default0]:Skipping sample id=256043. Maximum sequence length: 2049, sample length: 2133 +[default0]:Skipping sample id=116653. Maximum sequence length: 2049, sample length: 2196 +[default0]:Skipping sample id=1335402. Maximum sequence length: 2049, sample length: 4381 +[default0]:Skipping sample id=1295478. Maximum sequence length: 2049, sample length: 2167 +[default0]:Skipping sample id=888379. Maximum sequence length: 2049, sample length: 6571 +[default0]:Skipping sample id=1130972. Maximum sequence length: 2049, sample length: 2361 +[default0]:Skipping sample id=180213. Maximum sequence length: 2049, sample length: 2430 +[default0]:Skipping sample id=166453. Maximum sequence length: 2049, sample length: 2253 +[default0]:Skipping sample id=670741. Maximum sequence length: 2049, sample length: 2847 +[default0]:Skipping sample id=196954. Maximum sequence length: 2049, sample length: 2411 +[default0]:Skipping sample id=49272. Maximum sequence length: 2049, sample length: 2805 +[default0]:Skipping sample id=1236154. Maximum sequence length: 2049, sample length: 4391 +[default0]:Skipping sample id=656726. Maximum sequence length: 2049, sample length: 2406 +[default0]:Skipping sample id=1025804. Maximum sequence length: 2049, sample length: 2674 +[default0]:Skipping sample id=759452. Maximum sequence length: 2049, sample length: 2327 +[default0]:Skipping sample id=829409. Maximum sequence length: 2049, sample length: 4083 +[default0]:Skipping sample id=1117628. Maximum sequence length: 2049, sample length: 2177 +[default0]:Skipping sample id=457108. Maximum sequence length: 2049, sample length: 4006 +[default0]:Skipping sample id=1163945. Maximum sequence length: 2049, sample length: 3558 +[default0]:Skipping sample id=537551. Maximum sequence length: 2049, sample length: 3819 +[default0]:Skipping sample id=1421063. Maximum sequence length: 2049, sample length: 2098 +[default0]:Skipping sample id=1407048. Maximum sequence length: 2049, sample length: 2799 +[default0]:Skipping sample id=581687. Maximum sequence length: 2049, sample length: 2106 +[default0]:Skipping sample id=1542325. Maximum sequence length: 2049, sample length: 3203 +[default0]:Skipping sample id=416659. Maximum sequence length: 2049, sample length: 2081 +[default0]:Skipping sample id=311687. Maximum sequence length: 2049, sample length: 2344 +[default0]:Skipping sample id=493711. Maximum sequence length: 2049, sample length: 2911 +[default0]:Skipping sample id=186741. Maximum sequence length: 2049, sample length: 2343 +[default0]:Skipping sample id=137113. Maximum sequence length: 2049, sample length: 2263 +[default0]:Skipping sample id=1415973. Maximum sequence length: 2049, sample length: 2237 +[default0]:Skipping sample id=1250129. Maximum sequence length: 2049, sample length: 2180 +[default0]:Skipping sample id=883013. Maximum sequence length: 2049, sample length: 5221 +[default0]:Skipping sample id=1403713. Maximum sequence length: 2049, sample length: 4133 +[default0]:Skipping sample id=1292922. Maximum sequence length: 2049, sample length: 2116 +[default0]:Skipping sample id=818799. Maximum sequence length: 2049, sample length: 2617 +[default0]:Skipping sample id=759867. Maximum sequence length: 2049, sample length: 2323 +[default0]:Skipping sample id=752262. Maximum sequence length: 2049, sample length: 2209 +[default0]:Skipping sample id=383494. Maximum sequence length: 2049, sample length: 2462 +[default0]:Skipping sample id=588282. Maximum sequence length: 2049, sample length: 3057 +[default0]:Skipping sample id=891984. Maximum sequence length: 2049, sample length: 2274 +[default0]:Skipping sample id=653898. Maximum sequence length: 2049, sample length: 2397 +[default0]:Skipping sample id=1060366. Maximum sequence length: 2049, sample length: 4463 +[default0]:Skipping sample id=1550755. Maximum sequence length: 2049, sample length: 2361 +[default0]:Skipping sample id=1475644. Maximum sequence length: 2049, sample length: 4240 +[default0]:Skipping sample id=498709. Maximum sequence length: 2049, sample length: 2179 +[default0]:Skipping sample id=1208495. Maximum sequence length: 2049, sample length: 2568 +[default0]:Skipping sample id=363205. Maximum sequence length: 2049, sample length: 2282 +[default0]:Skipping sample id=486508. Maximum sequence length: 2049, sample length: 2359 +[default0]:Skipping sample id=323406. Maximum sequence length: 2049, sample length: 2186 +[default0]:Skipping sample id=1199017. Maximum sequence length: 2049, sample length: 3200 +[default0]:Skipping sample id=8875. Maximum sequence length: 2049, sample length: 10044 +[default0]:Skipping sample id=392370. Maximum sequence length: 2049, sample length: 2511 +[default0]:Skipping sample id=1564937. Maximum sequence length: 2049, sample length: 4316 +[default0]:Skipping sample id=974667. Maximum sequence length: 2049, sample length: 2760 +[default0]:Skipping sample id=1225021. Maximum sequence length: 2049, sample length: 2464 +[default0]:Skipping sample id=1343811. Maximum sequence length: 2049, sample length: 3278 +[default0]:Skipping sample id=1552517. Maximum sequence length: 2049, sample length: 2406 +[default0]:Skipping sample id=1162780. Maximum sequence length: 2049, sample length: 2275 +[default0]:Skipping sample id=285640. Maximum sequence length: 2049, sample length: 3279 +[default0]:Skipping sample id=349854. Maximum sequence length: 2049, sample length: 4363 +[default0]:Skipping sample id=198043. Maximum sequence length: 2049, sample length: 3145 +[default0]:Skipping sample id=679001. Maximum sequence length: 2049, sample length: 2350 +[default0]:Skipping sample id=558162. Maximum sequence length: 2049, sample length: 2954 +[default0]:Skipping sample id=365034. Maximum sequence length: 2049, sample length: 2109 +[default0]:Skipping sample id=1488123. Maximum sequence length: 2049, sample length: 2263 +[default0]:Skipping sample id=585175. Maximum sequence length: 2049, sample length: 2661 +[default0]:Skipping sample id=424805. Maximum sequence length: 2049, sample length: 2834 +[default0]:Skipping sample id=960970. Maximum sequence length: 2049, sample length: 2166 +[default0]:Skipping sample id=921779. Maximum sequence length: 2049, sample length: 3525 +[default0]:Skipping sample id=404085. Maximum sequence length: 2049, sample length: 3179 +[default0]:Skipping sample id=967266. Maximum sequence length: 2049, sample length: 4686 +[default0]:Skipping sample id=426076. Maximum sequence length: 2049, sample length: 2162 +[default0]:Skipping sample id=1184567. Maximum sequence length: 2049, sample length: 2605 +[default0]:Skipping sample id=962221. Maximum sequence length: 2049, sample length: 2860 +[default0]:Skipping sample id=1000221. Maximum sequence length: 2049, sample length: 2154 +[default0]:Skipping sample id=806873. Maximum sequence length: 2049, sample length: 4088 +[default0]:Skipping sample id=1012035. Maximum sequence length: 2049, sample length: 2561 +[default0]:Skipping sample id=1337263. Maximum sequence length: 2049, sample length: 3704 +[default0]:Skipping sample id=801619. Maximum sequence length: 2049, sample length: 2665 +[default0]:Skipping sample id=769736. Maximum sequence length: 2049, sample length: 2169 +[default0]:Skipping sample id=347708. Maximum sequence length: 2049, sample length: 2588 +[default0]:Skipping sample id=678426. Maximum sequence length: 2049, sample length: 3688 +[default0]:Skipping sample id=1327888. Maximum sequence length: 2049, sample length: 2570 +[default0]:Skipping sample id=986479. Maximum sequence length: 2049, sample length: 2516 +[default0]:Skipping sample id=1572467. Maximum sequence length: 2049, sample length: 3534 +[default0]:Skipping sample id=556794. Maximum sequence length: 2049, sample length: 2229 +[default0]:Skipping sample id=185089. Maximum sequence length: 2049, sample length: 2247 +[default0]:Skipping sample id=1435269. Maximum sequence length: 2049, sample length: 4387 +[default0]:Skipping sample id=1040622. Maximum sequence length: 2049, sample length: 2757 +[default0]:Skipping sample id=335906. Maximum sequence length: 2049, sample length: 2187 +[default0]:Skipping sample id=140435. Maximum sequence length: 2049, sample length: 2187 +[default0]:Skipping sample id=1525404. Maximum sequence length: 2049, sample length: 2908 +[default0]:Skipping sample id=999856. Maximum sequence length: 2049, sample length: 2792 +[default0]:Skipping sample id=72052. Maximum sequence length: 2049, sample length: 2093 +[default0]:Skipping sample id=504606. Maximum sequence length: 2049, sample length: 3443 +[default0]:Skipping sample id=1051605. Maximum sequence length: 2049, sample length: 4570 +[default0]:Skipping sample id=1159644. Maximum sequence length: 2049, sample length: 2514 +[default0]:Skipping sample id=342232. Maximum sequence length: 2049, sample length: 2596 +[default0]:Skipping sample id=1025758. Maximum sequence length: 2049, sample length: 3357 +[default0]:Skipping sample id=1253476. Maximum sequence length: 2049, sample length: 2412 +[default0]:Skipping sample id=1351845. Maximum sequence length: 2049, sample length: 2333 +[default0]:Skipping sample id=1327777. Maximum sequence length: 2049, sample length: 5714 +[default0]:Skipping sample id=1429142. Maximum sequence length: 2049, sample length: 2889 +[default0]:Skipping sample id=1392341. Maximum sequence length: 2049, sample length: 2154 +[default0]:Skipping sample id=1231821. Maximum sequence length: 2049, sample length: 2369 +[default0]:Skipping sample id=816343. Maximum sequence length: 2049, sample length: 2153 +[default0]:Skipping sample id=147593. Maximum sequence length: 2049, sample length: 3893 +[default0]:Skipping sample id=1444768. Maximum sequence length: 2049, sample length: 3131 +[default0]:Skipping sample id=867338. Maximum sequence length: 2049, sample length: 4771 +[default0]:Skipping sample id=1045783. Maximum sequence length: 2049, sample length: 2240 +[default0]:Skipping sample id=450657. Maximum sequence length: 2049, sample length: 3613 +[default0]:Skipping sample id=569636. Maximum sequence length: 2049, sample length: 2279 +[default0]:Skipping sample id=161148. Maximum sequence length: 2049, sample length: 4426 +[default0]:Skipping sample id=1561027. Maximum sequence length: 2049, sample length: 3470 +[default0]:Skipping sample id=1157864. Maximum sequence length: 2049, sample length: 2101 +[default0]:Skipping sample id=1079157. Maximum sequence length: 2049, sample length: 5111 +[default0]:Skipping sample id=586540. Maximum sequence length: 2049, sample length: 2133 +[default0]:Skipping sample id=1026603. Maximum sequence length: 2049, sample length: 2828 +[default0]:Skipping sample id=50852. Maximum sequence length: 2049, sample length: 2243 +[default0]:Skipping sample id=626081. Maximum sequence length: 2049, sample length: 2633 +[default0]:Skipping sample id=1466095. Maximum sequence length: 2049, sample length: 4036 +[default0]:Skipping sample id=625168. Maximum sequence length: 2049, sample length: 3780 +[default0]:Skipping sample id=1321017. Maximum sequence length: 2049, sample length: 2639 +[default0]:Skipping sample id=1334147. Maximum sequence length: 2049, sample length: 3083 +[default0]:Skipping sample id=1247281. Maximum sequence length: 2049, sample length: 3697 +[default0]:Skipping sample id=275252. Maximum sequence length: 2049, sample length: 2770 +[default0]:Skipping sample id=65873. Maximum sequence length: 2049, sample length: 2396 +[default0]:Skipping sample id=633360. Maximum sequence length: 2049, sample length: 2326 +[default0]:Skipping sample id=1014079. Maximum sequence length: 2049, sample length: 2242 +[default0]:Skipping sample id=503969. Maximum sequence length: 2049, sample length: 3582 +[default0]:Skipping sample id=805709. Maximum sequence length: 2049, sample length: 3630 +[default0]:Skipping sample id=155327. Maximum sequence length: 2049, sample length: 2482 +[default0]:Skipping sample id=1515069. Maximum sequence length: 2049, sample length: 2574 +[default0]:Skipping sample id=981292. Maximum sequence length: 2049, sample length: 2357 +[default0]:Skipping sample id=1216379. Maximum sequence length: 2049, sample length: 2469 +[default0]:Skipping sample id=1555562. Maximum sequence length: 2049, sample length: 2155 +[default0]:Skipping sample id=927874. Maximum sequence length: 2049, sample length: 3311 +[default0]:Skipping sample id=1231967. Maximum sequence length: 2049, sample length: 2374 +[default0]:Skipping sample id=244125. Maximum sequence length: 2049, sample length: 2741 +[default0]:Skipping sample id=1386891. Maximum sequence length: 2049, sample length: 2318 +[default0]:Skipping sample id=1096526. Maximum sequence length: 2049, sample length: 2412 +[default0]:Skipping sample id=1331964. Maximum sequence length: 2049, sample length: 2569 +[default0]:Skipping sample id=51836. Maximum sequence length: 2049, sample length: 3787 +[default0]:Skipping sample id=308723. Maximum sequence length: 2049, sample length: 2500 +[default0]:Skipping sample id=1552965. Maximum sequence length: 2049, sample length: 2156 +[default0]:Skipping sample id=602126. Maximum sequence length: 2049, sample length: 2742 +[default0]:Skipping sample id=958261. Maximum sequence length: 2049, sample length: 2216 +[default0]:Skipping sample id=587891. Maximum sequence length: 2049, sample length: 2528 +[default0]:Skipping sample id=1315744. Maximum sequence length: 2049, sample length: 2174 +[default0]:Skipping sample id=1529137. Maximum sequence length: 2049, sample length: 2385 +[default0]:Skipping sample id=887306. Maximum sequence length: 2049, sample length: 2314 +[default0]:Skipping sample id=1255651. Maximum sequence length: 2049, sample length: 3662 +[default0]:Skipping sample id=314332. Maximum sequence length: 2049, sample length: 3654 +[default0]:Skipping sample id=100052. Maximum sequence length: 2049, sample length: 2050 +[default0]:Skipping sample id=799462. Maximum sequence length: 2049, sample length: 2299 +[default0]:Skipping sample id=195503. Maximum sequence length: 2049, sample length: 2220 +[default0]:Skipping sample id=428342. Maximum sequence length: 2049, sample length: 2156 +[default0]:Skipping sample id=872869. Maximum sequence length: 2049, sample length: 5359 +[default0]:Skipping sample id=1247238. Maximum sequence length: 2049, sample length: 2287 +[default0]:Skipping sample id=1381051. Maximum sequence length: 2049, sample length: 2946 +[default0]:Skipping sample id=234615. Maximum sequence length: 2049, sample length: 2578 +[default0]:Skipping sample id=994803. Maximum sequence length: 2049, sample length: 2195 +[default0]:Skipping sample id=1468223. Maximum sequence length: 2049, sample length: 3759 +[default0]:Skipping sample id=1363315. Maximum sequence length: 2049, sample length: 2142 +[default0]:Skipping sample id=774752. Maximum sequence length: 2049, sample length: 2115 +[default0]:Skipping sample id=107156. Maximum sequence length: 2049, sample length: 2564 +[default0]:Skipping sample id=108622. Maximum sequence length: 2049, sample length: 3967 +[default0]:Skipping sample id=1170116. Maximum sequence length: 2049, sample length: 2074 +[default0]:Skipping sample id=982041. Maximum sequence length: 2049, sample length: 2857 +[default0]:Skipping sample id=426599. Maximum sequence length: 2049, sample length: 4024 +[default0]:Skipping sample id=293834. Maximum sequence length: 2049, sample length: 3902 +[default0]:Skipping sample id=701897. Maximum sequence length: 2049, sample length: 4336 +[default0]:Skipping sample id=779220. Maximum sequence length: 2049, sample length: 2366 +[default0]:Skipping sample id=1341304. Maximum sequence length: 2049, sample length: 2483 +[default0]:Skipping sample id=640499. Maximum sequence length: 2049, sample length: 2471 +[default0]:Skipping sample id=123049. Maximum sequence length: 2049, sample length: 2401 +[default0]:Skipping sample id=314700. Maximum sequence length: 2049, sample length: 2449 +[default0]:Skipping sample id=922595. Maximum sequence length: 2049, sample length: 2368 +[default0]:Skipping sample id=569948. Maximum sequence length: 2049, sample length: 2569 +[default0]:Skipping sample id=440311. Maximum sequence length: 2049, sample length: 2650 +[default0]:Skipping sample id=592872. Maximum sequence length: 2049, sample length: 2057 +[default0]:Skipping sample id=893767. Maximum sequence length: 2049, sample length: 3120 +[default0]:Skipping sample id=1506972. Maximum sequence length: 2049, sample length: 2269 +[default0]:Skipping sample id=1069622. Maximum sequence length: 2049, sample length: 3528 +[default0]:Skipping sample id=1115198. Maximum sequence length: 2049, sample length: 5576 +[default0]:Skipping sample id=587859. Maximum sequence length: 2049, sample length: 3289 +[default0]:Skipping sample id=1330633. Maximum sequence length: 2049, sample length: 2108 +[default0]:Skipping sample id=564610. Maximum sequence length: 2049, sample length: 3993 +[default0]:Skipping sample id=792232. Maximum sequence length: 2049, sample length: 2283 +[default0]:Skipping sample id=829625. Maximum sequence length: 2049, sample length: 2179 +[default0]:Skipping sample id=313991. Maximum sequence length: 2049, sample length: 2112 +[default0]:Skipping sample id=830333. Maximum sequence length: 2049, sample length: 2629 +[default0]:Skipping sample id=1060816. Maximum sequence length: 2049, sample length: 3283 +[default0]:Skipping sample id=913324. Maximum sequence length: 2049, sample length: 3091 +[default0]:Skipping sample id=535223. Maximum sequence length: 2049, sample length: 2094 +[default0]:Skipping sample id=1179584. Maximum sequence length: 2049, sample length: 3003 +[default0]:Skipping sample id=973827. Maximum sequence length: 2049, sample length: 2220 +[default0]:Skipping sample id=1560401. Maximum sequence length: 2049, sample length: 3690 +[default0]:Skipping sample id=1337596. Maximum sequence length: 2049, sample length: 2160 +[default0]:Skipping sample id=1512696. Maximum sequence length: 2049, sample length: 2051 +[default0]:Skipping sample id=640967. Maximum sequence length: 2049, sample length: 4114 +[default0]:Skipping sample id=169344. Maximum sequence length: 2049, sample length: 2313 +[default0]:Skipping sample id=1436492. Maximum sequence length: 2049, sample length: 2060 +[default0]:Skipping sample id=1444383. Maximum sequence length: 2049, sample length: 2930 +[default0]:Skipping sample id=88645. Maximum sequence length: 2049, sample length: 2241 +[default0]:Skipping sample id=1411594. Maximum sequence length: 2049, sample length: 2916 +[default0]:Skipping sample id=778758. Maximum sequence length: 2049, sample length: 2081 +[default0]:Skipping sample id=635791. Maximum sequence length: 2049, sample length: 2431 +[default0]:Skipping sample id=568393. Maximum sequence length: 2049, sample length: 6346 +[default0]:Skipping sample id=1211889. Maximum sequence length: 2049, sample length: 2957 +[default0]:Skipping sample id=1112552. Maximum sequence length: 2049, sample length: 2724 +[default0]:Skipping sample id=1504521. Maximum sequence length: 2049, sample length: 3172 +[default0]:Skipping sample id=808035. Maximum sequence length: 2049, sample length: 3111 +[default0]:Skipping sample id=64833. Maximum sequence length: 2049, sample length: 2095 +[default0]:Skipping sample id=384069. Maximum sequence length: 2049, sample length: 2628 +[default0]:Skipping sample id=1512840. Maximum sequence length: 2049, sample length: 2836 +[default0]:Skipping sample id=1506727. Maximum sequence length: 2049, sample length: 2819 +[default0]:Skipping sample id=1026832. Maximum sequence length: 2049, sample length: 2187 +[default0]:Skipping sample id=11097. Maximum sequence length: 2049, sample length: 2736 +[default0]:Skipping sample id=899867. Maximum sequence length: 2049, sample length: 4927 +[default0]:Skipping sample id=1269302. Maximum sequence length: 2049, sample length: 2877 +[default0]:Skipping sample id=1351816. Maximum sequence length: 2049, sample length: 2821 +[default0]:Skipping sample id=1491193. Maximum sequence length: 2049, sample length: 2876 +[default0]:Skipping sample id=1398854. Maximum sequence length: 2049, sample length: 2616 +[default0]:Skipping sample id=208754. Maximum sequence length: 2049, sample length: 3370 +[default0]:Skipping sample id=741773. Maximum sequence length: 2049, sample length: 2695 +[default0]:Skipping sample id=1531456. Maximum sequence length: 2049, sample length: 2140 +[default0]:Skipping sample id=196195. Maximum sequence length: 2049, sample length: 2143 +[default0]:Skipping sample id=284282. Maximum sequence length: 2049, sample length: 2106 +[default0]:Skipping sample id=1349214. Maximum sequence length: 2049, sample length: 2139 +[default0]:Skipping sample id=753970. Maximum sequence length: 2049, sample length: 3232 +[default0]:Skipping sample id=593758. Maximum sequence length: 2049, sample length: 2129 +[default0]:Skipping sample id=444381. Maximum sequence length: 2049, sample length: 2557 +[default0]:Skipping sample id=430181. Maximum sequence length: 2049, sample length: 2371 +[default0]:Skipping sample id=1570020. Maximum sequence length: 2049, sample length: 2218 +[default0]:Skipping sample id=1379705. Maximum sequence length: 2049, sample length: 2853 +[default0]:Skipping sample id=741976. Maximum sequence length: 2049, sample length: 2530 +[default0]:Skipping sample id=1433637. Maximum sequence length: 2049, sample length: 2194 +[default0]:Skipping sample id=1086232. Maximum sequence length: 2049, sample length: 2852 +[default0]:Skipping sample id=923349. Maximum sequence length: 2049, sample length: 3767 +[default0]:Skipping sample id=25266. Maximum sequence length: 2049, sample length: 2662 +[default0]:Skipping sample id=709505. Maximum sequence length: 2049, sample length: 2577 +[default0]:Skipping sample id=765812. Maximum sequence length: 2049, sample length: 2237 +[default0]:Skipping sample id=679628. Maximum sequence length: 2049, sample length: 2699 +[default0]:Skipping sample id=585494. Maximum sequence length: 2049, sample length: 5470 +[default0]:Skipping sample id=243575. Maximum sequence length: 2049, sample length: 3732 +[default0]:Skipping sample id=1495309. Maximum sequence length: 2049, sample length: 4743 +[default0]:Skipping sample id=1359259. Maximum sequence length: 2049, sample length: 4392 +[default0]:Skipping sample id=1095304. Maximum sequence length: 2049, sample length: 2102 +[default0]:Skipping sample id=855650. Maximum sequence length: 2049, sample length: 2246 +[default0]:Skipping sample id=825671. Maximum sequence length: 2049, sample length: 3860 +[default0]:Skipping sample id=933065. Maximum sequence length: 2049, sample length: 2092 +[default0]:Skipping sample id=603926. Maximum sequence length: 2049, sample length: 2452 +[default0]:Skipping sample id=1316722. Maximum sequence length: 2049, sample length: 2486 +[default0]:Skipping sample id=815944. Maximum sequence length: 2049, sample length: 3012 +[default0]:Skipping sample id=703057. Maximum sequence length: 2049, sample length: 2130 +[default0]:Skipping sample id=1173245. Maximum sequence length: 2049, sample length: 3611 +[default0]:Skipping sample id=1184099. Maximum sequence length: 2049, sample length: 3058 +[default0]:Skipping sample id=41351. Maximum sequence length: 2049, sample length: 3328 +[default0]:Skipping sample id=394580. Maximum sequence length: 2049, sample length: 2557 +[default0]:Skipping sample id=158238. Maximum sequence length: 2049, sample length: 3940 +[default0]:Skipping sample id=432492. Maximum sequence length: 2049, sample length: 2311 +[default0]:Skipping sample id=624261. Maximum sequence length: 2049, sample length: 2348 +[default0]:Skipping sample id=1248833. Maximum sequence length: 2049, sample length: 3270 +[default0]:Skipping sample id=495554. Maximum sequence length: 2049, sample length: 2868 +[default0]:Skipping sample id=1266691. Maximum sequence length: 2049, sample length: 2496 +[default0]:Skipping sample id=240049. Maximum sequence length: 2049, sample length: 3127 +[default0]:Skipping sample id=364220. Maximum sequence length: 2049, sample length: 2736 +[default0]:Skipping sample id=1312714. Maximum sequence length: 2049, sample length: 2397 +[default0]:Skipping sample id=633585. Maximum sequence length: 2049, sample length: 2122 +[default0]:Skipping sample id=591564. Maximum sequence length: 2049, sample length: 2686 +[default0]:Skipping sample id=144325. Maximum sequence length: 2049, sample length: 3959 +[default0]:Skipping sample id=540621. Maximum sequence length: 2049, sample length: 4524 +[default0]:Skipping sample id=391639. Maximum sequence length: 2049, sample length: 2101 +[default0]:Skipping sample id=1137264. Maximum sequence length: 2049, sample length: 2554 +[default0]:Skipping sample id=959117. Maximum sequence length: 2049, sample length: 2685 +[default0]:Skipping sample id=1105924. Maximum sequence length: 2049, sample length: 3767 +[default0]:Skipping sample id=94221. Maximum sequence length: 2049, sample length: 2116 +[default0]:Skipping sample id=911745. Maximum sequence length: 2049, sample length: 3517 +[default0]:Skipping sample id=1462261. Maximum sequence length: 2049, sample length: 3333 +[default0]:Skipping sample id=918269. Maximum sequence length: 2049, sample length: 3131 +[default0]:Skipping sample id=1498440. Maximum sequence length: 2049, sample length: 2925 +[default0]:Skipping sample id=1207244. Maximum sequence length: 2049, sample length: 2712 +[default0]:Skipping sample id=1565215. Maximum sequence length: 2049, sample length: 2697 +[default0]:Skipping sample id=40653. Maximum sequence length: 2049, sample length: 2595 +[default0]:Skipping sample id=1081430. Maximum sequence length: 2049, sample length: 2808 +[default0]:Skipping sample id=1439262. Maximum sequence length: 2049, sample length: 3553 +[default0]:Skipping sample id=1039923. Maximum sequence length: 2049, sample length: 3918 +[default0]:Skipping sample id=77316. Maximum sequence length: 2049, sample length: 5936 +[default0]:Skipping sample id=447887. Maximum sequence length: 2049, sample length: 2554 +[default0]:Skipping sample id=1570624. Maximum sequence length: 2049, sample length: 2154 +[default0]:Skipping sample id=857808. Maximum sequence length: 2049, sample length: 2300 +[default0]:Skipping sample id=707672. Maximum sequence length: 2049, sample length: 2939 +[default0]:Skipping sample id=459417. Maximum sequence length: 2049, sample length: 3180 +[default0]:Skipping sample id=43989. Maximum sequence length: 2049, sample length: 3177 +[default0]:Skipping sample id=258598. Maximum sequence length: 2049, sample length: 2930 +[default0]:Skipping sample id=986862. Maximum sequence length: 2049, sample length: 2135 +[default0]:Skipping sample id=102246. Maximum sequence length: 2049, sample length: 2816 +[default0]:Skipping sample id=1189191. Maximum sequence length: 2049, sample length: 2083 +[default0]:Skipping sample id=398655. Maximum sequence length: 2049, sample length: 3316 +[default0]:Skipping sample id=88361. Maximum sequence length: 2049, sample length: 4133 +[default0]:Skipping sample id=786130. Maximum sequence length: 2049, sample length: 2978 +[default0]:Skipping sample id=119636. Maximum sequence length: 2049, sample length: 4727 +[default0]:Skipping sample id=172889. Maximum sequence length: 2049, sample length: 3893 +[default0]:Skipping sample id=63191. Maximum sequence length: 2049, sample length: 2373 +[default0]:Skipping sample id=845665. Maximum sequence length: 2049, sample length: 2262 +[default0]:Skipping sample id=1173291. Maximum sequence length: 2049, sample length: 2442 +[default0]:Skipping sample id=733721. Maximum sequence length: 2049, sample length: 2474 +[default0]:Skipping sample id=65535. Maximum sequence length: 2049, sample length: 2342 +[default0]:Skipping sample id=1244607. Maximum sequence length: 2049, sample length: 2439 +[default0]:Skipping sample id=1293065. Maximum sequence length: 2049, sample length: 3352 +[default0]:Skipping sample id=738669. Maximum sequence length: 2049, sample length: 2599 +[default0]:Skipping sample id=577640. Maximum sequence length: 2049, sample length: 2254 +[default0]:Skipping sample id=1561452. Maximum sequence length: 2049, sample length: 2319 +[default0]:Skipping sample id=803424. Maximum sequence length: 2049, sample length: 4027 +[default0]:Skipping sample id=554022. Maximum sequence length: 2049, sample length: 2196 +[default0]:Skipping sample id=1174059. Maximum sequence length: 2049, sample length: 4738 +[default0]:Skipping sample id=1426943. Maximum sequence length: 2049, sample length: 2698 +[default0]:Skipping sample id=232085. Maximum sequence length: 2049, sample length: 5288 +[default0]:Skipping sample id=439261. Maximum sequence length: 2049, sample length: 2491 +[default0]:Skipping sample id=1235941. Maximum sequence length: 2049, sample length: 2056 +[default0]:Skipping sample id=1542545. Maximum sequence length: 2049, sample length: 4956 +[default0]:Skipping sample id=597617. Maximum sequence length: 2049, sample length: 2100 +[default0]:Skipping sample id=187090. Maximum sequence length: 2049, sample length: 2876 +[default0]:Skipping sample id=215672. Maximum sequence length: 2049, sample length: 3388 +[default0]:Skipping sample id=334605. Maximum sequence length: 2049, sample length: 3436 +[default0]:Skipping sample id=985947. Maximum sequence length: 2049, sample length: 5154 +[default0]:Skipping sample id=950359. Maximum sequence length: 2049, sample length: 3129 +[default0]:Skipping sample id=540274. Maximum sequence length: 2049, sample length: 2099 +[default0]:Skipping sample id=1173449. Maximum sequence length: 2049, sample length: 2476 +[default0]:Skipping sample id=1419070. Maximum sequence length: 2049, sample length: 2575 +[default0]:Skipping sample id=859862. Maximum sequence length: 2049, sample length: 2770 +[default0]:Skipping sample id=1465123. Maximum sequence length: 2049, sample length: 2178 +[default0]:Skipping sample id=912411. Maximum sequence length: 2049, sample length: 3576 +[default0]:Skipping sample id=31290. Maximum sequence length: 2049, sample length: 6010 +[default0]:Skipping sample id=229909. Maximum sequence length: 2049, sample length: 2364 +[default0]:Skipping sample id=1398317. Maximum sequence length: 2049, sample length: 2636 +[default0]:Skipping sample id=221044. Maximum sequence length: 2049, sample length: 4881 +[default0]:Skipping sample id=1056756. Maximum sequence length: 2049, sample length: 4943 +[default0]:Skipping sample id=1263402. Maximum sequence length: 2049, sample length: 3488 +[default0]:Skipping sample id=1309373. Maximum sequence length: 2049, sample length: 5263 +[default0]:Skipping sample id=341350. Maximum sequence length: 2049, sample length: 2556 +[default0]:Skipping sample id=1147504. Maximum sequence length: 2049, sample length: 2122 +[default0]:Skipping sample id=977206. Maximum sequence length: 2049, sample length: 2658 +[default0]:Skipping sample id=168144. Maximum sequence length: 2049, sample length: 3218 +[default0]:Skipping sample id=1466531. Maximum sequence length: 2049, sample length: 2320 +[default0]:Skipping sample id=1132067. Maximum sequence length: 2049, sample length: 2187 +[default0]:Skipping sample id=508646. Maximum sequence length: 2049, sample length: 3599 +[default0]:Skipping sample id=1539671. Maximum sequence length: 2049, sample length: 2137 +[default0]:Skipping sample id=1429262. Maximum sequence length: 2049, sample length: 4753 +[default0]:Skipping sample id=129931. Maximum sequence length: 2049, sample length: 2175 +[default0]:Skipping sample id=1466674. Maximum sequence length: 2049, sample length: 2777 +[default0]:Skipping sample id=1009299. Maximum sequence length: 2049, sample length: 2524 +[default0]:Skipping sample id=378483. Maximum sequence length: 2049, sample length: 2829 +[default0]:Skipping sample id=1551794. Maximum sequence length: 2049, sample length: 2493 +[default0]:Skipping sample id=345543. Maximum sequence length: 2049, sample length: 2138 +[default0]:Skipping sample id=200848. Maximum sequence length: 2049, sample length: 2549 +[default0]:Skipping sample id=408395. Maximum sequence length: 2049, sample length: 3480 +[default0]:Skipping sample id=95606. Maximum sequence length: 2049, sample length: 2078 +[default0]:Skipping sample id=803385. Maximum sequence length: 2049, sample length: 3463 +[default0]:Skipping sample id=777485. Maximum sequence length: 2049, sample length: 2258 +[default0]:Skipping sample id=510055. Maximum sequence length: 2049, sample length: 4485 +[default0]:Skipping sample id=194070. Maximum sequence length: 2049, sample length: 2838 +[default0]:Skipping sample id=1069406. Maximum sequence length: 2049, sample length: 2992 +[default0]:Skipping sample id=795144. Maximum sequence length: 2049, sample length: 2409 +[default0]:Skipping sample id=658157. Maximum sequence length: 2049, sample length: 2717 +[default0]:Skipping sample id=106872. Maximum sequence length: 2049, sample length: 2324 +[default0]:Skipping sample id=103770. Maximum sequence length: 2049, sample length: 2639 +[default0]:Skipping sample id=1326314. Maximum sequence length: 2049, sample length: 2794 +[default0]:Skipping sample id=787944. Maximum sequence length: 2049, sample length: 4481 +[default0]:Skipping sample id=295775. Maximum sequence length: 2049, sample length: 2518 +[default0]:Skipping sample id=879939. Maximum sequence length: 2049, sample length: 3224 +[default0]:Skipping sample id=1200644. Maximum sequence length: 2049, sample length: 3511 +[default0]:Skipping sample id=1298049. Maximum sequence length: 2049, sample length: 2448 +[default0]:Skipping sample id=830303. Maximum sequence length: 2049, sample length: 2319 +[default0]:Skipping sample id=573821. Maximum sequence length: 2049, sample length: 3083 +[default0]:Skipping sample id=939090. Maximum sequence length: 2049, sample length: 3375 +[default0]:Skipping sample id=495407. Maximum sequence length: 2049, sample length: 3556 +[default0]:Skipping sample id=51994. Maximum sequence length: 2049, sample length: 4690 +[default0]:Skipping sample id=1402724. Maximum sequence length: 2049, sample length: 2669 +[default0]:Skipping sample id=1184090. Maximum sequence length: 2049, sample length: 3088 +[default0]:Skipping sample id=890495. Maximum sequence length: 2049, sample length: 2092 +[default0]:Skipping sample id=473012. Maximum sequence length: 2049, sample length: 4760 +[default0]:Skipping sample id=505654. Maximum sequence length: 2049, sample length: 2721 +[default0]:Skipping sample id=483759. Maximum sequence length: 2049, sample length: 4101 +[default0]:Skipping sample id=1503547. Maximum sequence length: 2049, sample length: 3814 +[default0]:Skipping sample id=366840. Maximum sequence length: 2049, sample length: 2852 +[default0]:Skipping sample id=426804. Maximum sequence length: 2049, sample length: 2165 +[default0]:Skipping sample id=1326631. Maximum sequence length: 2049, sample length: 3566 +[default0]:Skipping sample id=991689. Maximum sequence length: 2049, sample length: 3441 +[default0]:Skipping sample id=1238352. Maximum sequence length: 2049, sample length: 4335 +[default0]:Skipping sample id=1180732. Maximum sequence length: 2049, sample length: 2340 +[default0]:Skipping sample id=1089204. Maximum sequence length: 2049, sample length: 2100 +[default0]:Skipping sample id=1210592. Maximum sequence length: 2049, sample length: 2084 +[default0]:Skipping sample id=1347391. Maximum sequence length: 2049, sample length: 2649 +[default0]:Skipping sample id=187458. Maximum sequence length: 2049, sample length: 3833 +[default0]:Skipping sample id=1034932. Maximum sequence length: 2049, sample length: 2250 +[default0]:Skipping sample id=1249892. Maximum sequence length: 2049, sample length: 2897 +[default0]:Skipping sample id=566004. Maximum sequence length: 2049, sample length: 2244 +[default0]:Skipping sample id=521365. Maximum sequence length: 2049, sample length: 3100 +[default0]:Skipping sample id=1220138. Maximum sequence length: 2049, sample length: 2923 +[default0]:Skipping sample id=938167. Maximum sequence length: 2049, sample length: 5769 +[default0]:Skipping sample id=441518. Maximum sequence length: 2049, sample length: 2402 +[default0]:Skipping sample id=1394701. Maximum sequence length: 2049, sample length: 3089 +[default0]:Skipping sample id=680417. Maximum sequence length: 2049, sample length: 3422 +[default0]:Skipping sample id=1497219. Maximum sequence length: 2049, sample length: 2567 +[default0]:Skipping sample id=486768. Maximum sequence length: 2049, sample length: 2306 +[default0]:Skipping sample id=1248168. Maximum sequence length: 2049, sample length: 2681 +[default0]:Skipping sample id=1505261. Maximum sequence length: 2049, sample length: 2150 +[default0]:Skipping sample id=1215420. Maximum sequence length: 2049, sample length: 2078 +[default0]:Skipping sample id=455205. Maximum sequence length: 2049, sample length: 2248 +[default0]:Skipping sample id=41457. Maximum sequence length: 2049, sample length: 4141 +[default0]:Skipping sample id=1138907. Maximum sequence length: 2049, sample length: 3717 +[default0]:Skipping sample id=354941. Maximum sequence length: 2049, sample length: 4005 +[default0]:Skipping sample id=1051064. Maximum sequence length: 2049, sample length: 3388 +[default0]:Skipping sample id=817817. Maximum sequence length: 2049, sample length: 4055 +[default0]:Skipping sample id=721072. Maximum sequence length: 2049, sample length: 2265 +[default0]:Skipping sample id=627294. Maximum sequence length: 2049, sample length: 2846 +[default0]:Skipping sample id=389841. Maximum sequence length: 2049, sample length: 2398 +[default0]:Skipping sample id=995485. Maximum sequence length: 2049, sample length: 2706 +[default0]:Skipping sample id=1280566. Maximum sequence length: 2049, sample length: 3460 +[default0]:Skipping sample id=747129. Maximum sequence length: 2049, sample length: 3734 +[default0]:Skipping sample id=1068050. Maximum sequence length: 2049, sample length: 2258 +[default0]:Skipping sample id=461445. Maximum sequence length: 2049, sample length: 4324 +[default0]:Skipping sample id=1472816. Maximum sequence length: 2049, sample length: 2183 +[default0]:Skipping sample id=511700. Maximum sequence length: 2049, sample length: 4930 +[default0]:Skipping sample id=890164. Maximum sequence length: 2049, sample length: 2725 +[default0]:Skipping sample id=388313. Maximum sequence length: 2049, sample length: 2801 +[default0]:Skipping sample id=1260634. Maximum sequence length: 2049, sample length: 3339 +[default0]:Skipping sample id=796724. Maximum sequence length: 2049, sample length: 2058 +[default0]:Skipping sample id=610878. Maximum sequence length: 2049, sample length: 2208 +[default0]:Skipping sample id=878818. Maximum sequence length: 2049, sample length: 3210 +[default0]:Skipping sample id=369331. Maximum sequence length: 2049, sample length: 6115 +[default0]:Skipping sample id=352519. Maximum sequence length: 2049, sample length: 2115 +[default0]:Skipping sample id=1191584. Maximum sequence length: 2049, sample length: 5743 +[default0]:Skipping sample id=62310. Maximum sequence length: 2049, sample length: 2420 +[default0]:Skipping sample id=236672. Maximum sequence length: 2049, sample length: 2746 +[default0]:Skipping sample id=290587. Maximum sequence length: 2049, sample length: 3031 +[default0]:Skipping sample id=1339036. Maximum sequence length: 2049, sample length: 4059 +[default0]:Skipping sample id=516597. Maximum sequence length: 2049, sample length: 2831 +[default0]:Skipping sample id=1532958. Maximum sequence length: 2049, sample length: 2455 +[default0]:Skipping sample id=830977. Maximum sequence length: 2049, sample length: 3289 +[default0]:Skipping sample id=445223. Maximum sequence length: 2049, sample length: 3967 +[default0]:Skipping sample id=392898. Maximum sequence length: 2049, sample length: 4516 +[default0]:Skipping sample id=1116486. Maximum sequence length: 2049, sample length: 3665 +[default0]:Skipping sample id=121564. Maximum sequence length: 2049, sample length: 2070 +[default0]:Skipping sample id=773872. Maximum sequence length: 2049, sample length: 3733 +[default0]:Skipping sample id=130803. Maximum sequence length: 2049, sample length: 3511 +[default0]:Skipping sample id=1538398. Maximum sequence length: 2049, sample length: 2148 +[default0]:Skipping sample id=1521530. Maximum sequence length: 2049, sample length: 4410 +[default0]:Skipping sample id=179524. Maximum sequence length: 2049, sample length: 2615 +[default0]:Skipping sample id=345889. Maximum sequence length: 2049, sample length: 3870 +[default0]:Skipping sample id=629589. Maximum sequence length: 2049, sample length: 2473 +[default0]:Skipping sample id=799626. Maximum sequence length: 2049, sample length: 2218 +[default0]:Skipping sample id=574331. Maximum sequence length: 2049, sample length: 5109 +[default0]:Skipping sample id=1450063. Maximum sequence length: 2049, sample length: 3150 +[default0]:Skipping sample id=1474018. Maximum sequence length: 2049, sample length: 2303 +[default0]:Skipping sample id=1232081. Maximum sequence length: 2049, sample length: 2458 +[default0]:Skipping sample id=146246. Maximum sequence length: 2049, sample length: 3168 +[default0]:Skipping sample id=876076. Maximum sequence length: 2049, sample length: 2089 +[default0]:Skipping sample id=1474612. Maximum sequence length: 2049, sample length: 3337 +[default0]:Skipping sample id=1210465. Maximum sequence length: 2049, sample length: 2129 +[default0]:Skipping sample id=68711. Maximum sequence length: 2049, sample length: 3065 +[default0]:Skipping sample id=1119126. Maximum sequence length: 2049, sample length: 4157 +[default0]:Skipping sample id=490300. Maximum sequence length: 2049, sample length: 2216 +[default0]:Skipping sample id=1105972. Maximum sequence length: 2049, sample length: 3763 +[default0]:Skipping sample id=14833. Maximum sequence length: 2049, sample length: 2228 +[default0]:Skipping sample id=495230. Maximum sequence length: 2049, sample length: 3395 +[default0]:Skipping sample id=274911. Maximum sequence length: 2049, sample length: 2421 +[default0]:Skipping sample id=213437. Maximum sequence length: 2049, sample length: 2247 +[default0]:Skipping sample id=679738. Maximum sequence length: 2049, sample length: 5274 +[default0]:Skipping sample id=298477. Maximum sequence length: 2049, sample length: 3397 +[default0]:Skipping sample id=132372. Maximum sequence length: 2049, sample length: 3114 +[default0]:Skipping sample id=912089. Maximum sequence length: 2049, sample length: 2445 +[default0]:Skipping sample id=287638. Maximum sequence length: 2049, sample length: 2148 +[default0]:Skipping sample id=1561933. Maximum sequence length: 2049, sample length: 2359 +[default0]:Skipping sample id=8257. Maximum sequence length: 2049, sample length: 4138 +[default0]:Skipping sample id=301436. Maximum sequence length: 2049, sample length: 3051 +[default0]:Skipping sample id=547645. Maximum sequence length: 2049, sample length: 2402 +[default0]:Skipping sample id=1483093. Maximum sequence length: 2049, sample length: 3385 +[default0]:Skipping sample id=596088. Maximum sequence length: 2049, sample length: 2114 +[default0]:Skipping sample id=1555850. Maximum sequence length: 2049, sample length: 2660 +[default0]:Skipping sample id=700765. Maximum sequence length: 2049, sample length: 2720 +[default0]:Skipping sample id=526068. Maximum sequence length: 2049, sample length: 2164 +[default0]:Skipping sample id=237498. Maximum sequence length: 2049, sample length: 2737 +[default0]:Skipping sample id=1402905. Maximum sequence length: 2049, sample length: 2311 +[default0]:Skipping sample id=765760. Maximum sequence length: 2049, sample length: 3223 +[default0]:Skipping sample id=1161579. Maximum sequence length: 2049, sample length: 2257 +[default0]:Skipping sample id=787349. Maximum sequence length: 2049, sample length: 2323 +[default0]:Skipping sample id=701287. Maximum sequence length: 2049, sample length: 2771 +[default0]:Skipping sample id=1216804. Maximum sequence length: 2049, sample length: 2622 +[default0]:Skipping sample id=558396. Maximum sequence length: 2049, sample length: 2514 +[default0]:Skipping sample id=1141600. Maximum sequence length: 2049, sample length: 2533 +[default0]:Skipping sample id=200496. Maximum sequence length: 2049, sample length: 2082 +[default0]:Skipping sample id=830064. Maximum sequence length: 2049, sample length: 2743 +[default0]:Skipping sample id=918574. Maximum sequence length: 2049, sample length: 2655 +[default0]:Skipping sample id=1481018. Maximum sequence length: 2049, sample length: 2280 +[default0]:Skipping sample id=1122217. Maximum sequence length: 2049, sample length: 2664 +[default0]:Skipping sample id=154058. Maximum sequence length: 2049, sample length: 2197 +[default0]:Skipping sample id=1173356. Maximum sequence length: 2049, sample length: 2844 +[default0]:Skipping sample id=118061. Maximum sequence length: 2049, sample length: 2095 +[default0]:Skipping sample id=1480443. Maximum sequence length: 2049, sample length: 2565 +[default0]:Skipping sample id=830557. Maximum sequence length: 2049, sample length: 2086 +[default0]:Skipping sample id=799824. Maximum sequence length: 2049, sample length: 2242 +[default0]:Skipping sample id=445330. Maximum sequence length: 2049, sample length: 4387 +[default0]:Skipping sample id=883462. Maximum sequence length: 2049, sample length: 2145 +[default0]:Skipping sample id=474028. Maximum sequence length: 2049, sample length: 2787 +[default0]:Skipping sample id=161663. Maximum sequence length: 2049, sample length: 2196 +[default0]:Skipping sample id=60494. Maximum sequence length: 2049, sample length: 3548 +[default0]:Skipping sample id=1085391. Maximum sequence length: 2049, sample length: 2389 +[default0]:Skipping sample id=671753. Maximum sequence length: 2049, sample length: 4811 +[default0]:Skipping sample id=253124. Maximum sequence length: 2049, sample length: 2825 +[default0]:Skipping sample id=1382633. Maximum sequence length: 2049, sample length: 2804 +[default0]:Skipping sample id=1370888. Maximum sequence length: 2049, sample length: 3332 +[default0]:Skipping sample id=883036. Maximum sequence length: 2049, sample length: 2340 +[default0]:Skipping sample id=1473356. Maximum sequence length: 2049, sample length: 2285 +[default0]:Skipping sample id=669159. Maximum sequence length: 2049, sample length: 3880 +[default0]:Skipping sample id=273278. Maximum sequence length: 2049, sample length: 2904 +[default0]:Skipping sample id=689322. Maximum sequence length: 2049, sample length: 2455 +[default0]:Skipping sample id=1247558. Maximum sequence length: 2049, sample length: 2077 +[default0]:Skipping sample id=1148763. Maximum sequence length: 2049, sample length: 5059 +[default0]:Skipping sample id=1002616. Maximum sequence length: 2049, sample length: 2063 +[default0]:Skipping sample id=1229904. Maximum sequence length: 2049, sample length: 2920 +[default0]:Skipping sample id=144104. Maximum sequence length: 2049, sample length: 3024 +[default0]:Skipping sample id=336841. Maximum sequence length: 2049, sample length: 4059 +[default0]:Skipping sample id=96641. Maximum sequence length: 2049, sample length: 2576 +[default0]:Skipping sample id=856318. Maximum sequence length: 2049, sample length: 3997 +[default0]:Skipping sample id=49153. Maximum sequence length: 2049, sample length: 2305 +[default0]:Skipping sample id=979337. Maximum sequence length: 2049, sample length: 2363 +[default0]:Skipping sample id=592684. Maximum sequence length: 2049, sample length: 4960 +[default0]:Skipping sample id=446534. Maximum sequence length: 2049, sample length: 5776 +[default0]:Skipping sample id=1305602. Maximum sequence length: 2049, sample length: 4585 +[default0]:Skipping sample id=1390827. Maximum sequence length: 2049, sample length: 2619 +[default0]:Skipping sample id=1210203. Maximum sequence length: 2049, sample length: 3341 +[default0]:Skipping sample id=1482423. Maximum sequence length: 2049, sample length: 2105 +[default0]:Skipping sample id=1184184. Maximum sequence length: 2049, sample length: 3573 +[default0]:Skipping sample id=740564. Maximum sequence length: 2049, sample length: 3264 +[default0]:Skipping sample id=298308. Maximum sequence length: 2049, sample length: 2053 +[default0]:Skipping sample id=583457. Maximum sequence length: 2049, sample length: 2510 +[default0]:Skipping sample id=1573549. Maximum sequence length: 2049, sample length: 2513 +[default0]:Skipping sample id=249160. Maximum sequence length: 2049, sample length: 2560 +[default0]:Skipping sample id=162099. Maximum sequence length: 2049, sample length: 2052 +[default0]:Skipping sample id=532830. Maximum sequence length: 2049, sample length: 4367 +[default0]:Skipping sample id=616476. Maximum sequence length: 2049, sample length: 3135 +[default0]:Skipping sample id=1236417. Maximum sequence length: 2049, sample length: 3287 +[default0]:Skipping sample id=1103082. Maximum sequence length: 2049, sample length: 2392 +[default0]:Skipping sample id=1105732. Maximum sequence length: 2049, sample length: 2278 +[default0]:Skipping sample id=1508911. Maximum sequence length: 2049, sample length: 2644 +[default0]:Skipping sample id=1143915. Maximum sequence length: 2049, sample length: 3460 +[default0]:Skipping sample id=1147853. Maximum sequence length: 2049, sample length: 4266 +[default0]:Skipping sample id=679553. Maximum sequence length: 2049, sample length: 2101 +[default0]:Skipping sample id=719005. Maximum sequence length: 2049, sample length: 2276 +[default0]:Skipping sample id=1242590. Maximum sequence length: 2049, sample length: 2108 +[default0]:Skipping sample id=1263820. Maximum sequence length: 2049, sample length: 2530 +[default0]:Skipping sample id=1338118. Maximum sequence length: 2049, sample length: 2321 +[default0]:Skipping sample id=921875. Maximum sequence length: 2049, sample length: 2913 +[default0]:Skipping sample id=155463. Maximum sequence length: 2049, sample length: 2239 +[default0]:Skipping sample id=1175910. Maximum sequence length: 2049, sample length: 2137 +[default0]:Skipping sample id=1517706. Maximum sequence length: 2049, sample length: 2147 +[default0]:Skipping sample id=1542211. Maximum sequence length: 2049, sample length: 2311 +[default0]:Skipping sample id=1149674. Maximum sequence length: 2049, sample length: 5467 +[default0]:Skipping sample id=947528. Maximum sequence length: 2049, sample length: 2076 +[default0]:Skipping sample id=865505. Maximum sequence length: 2049, sample length: 2139 +[default0]:Skipping sample id=663396. Maximum sequence length: 2049, sample length: 2126 +[default0]:Skipping sample id=1567136. Maximum sequence length: 2049, sample length: 4979 +[default0]:Skipping sample id=516323. Maximum sequence length: 2049, sample length: 2660 +[default0]:Skipping sample id=70745. Maximum sequence length: 2049, sample length: 2473 +[default0]:Skipping sample id=182892. Maximum sequence length: 2049, sample length: 2455 +[default0]:Skipping sample id=1204038. Maximum sequence length: 2049, sample length: 2359 +[default0]:Skipping sample id=1512211. Maximum sequence length: 2049, sample length: 2240 +[default0]:Skipping sample id=672954. Maximum sequence length: 2049, sample length: 2447 +[default0]:Skipping sample id=626241. Maximum sequence length: 2049, sample length: 2539 +[default0]:Skipping sample id=687638. Maximum sequence length: 2049, sample length: 2758 +[default0]:Skipping sample id=1536805. Maximum sequence length: 2049, sample length: 2604 +[default0]:Skipping sample id=1302086. Maximum sequence length: 2049, sample length: 2353 +[default0]:Skipping sample id=577165. Maximum sequence length: 2049, sample length: 4185 +[default0]:Skipping sample id=1280254. Maximum sequence length: 2049, sample length: 6309 +[default0]:Skipping sample id=728698. Maximum sequence length: 2049, sample length: 2401 +[default0]:Skipping sample id=94987. Maximum sequence length: 2049, sample length: 4278 +[default0]:Skipping sample id=325219. Maximum sequence length: 2049, sample length: 2533 +[default0]:Skipping sample id=552078. Maximum sequence length: 2049, sample length: 2382 +[default0]:Skipping sample id=379110. Maximum sequence length: 2049, sample length: 2812 +[default0]:Skipping sample id=34797. Maximum sequence length: 2049, sample length: 2908 +[default0]:Skipping sample id=252780. Maximum sequence length: 2049, sample length: 2276 +[default0]:Skipping sample id=1378694. Maximum sequence length: 2049, sample length: 2171 +[default0]:Skipping sample id=804682. Maximum sequence length: 2049, sample length: 3542 +[default0]:Skipping sample id=939717. Maximum sequence length: 2049, sample length: 2407 +[default0]:Skipping sample id=800957. Maximum sequence length: 2049, sample length: 2467 +[default0]:Skipping sample id=393993. Maximum sequence length: 2049, sample length: 3724 +[default0]:Skipping sample id=1463373. Maximum sequence length: 2049, sample length: 2849 +[default0]:Skipping sample id=716821. Maximum sequence length: 2049, sample length: 2696 +[default0]:Skipping sample id=86527. Maximum sequence length: 2049, sample length: 3293 +[default0]:Skipping sample id=430820. Maximum sequence length: 2049, sample length: 3864 +[default0]:Skipping sample id=1121083. Maximum sequence length: 2049, sample length: 5519 +[default0]:Skipping sample id=725998. Maximum sequence length: 2049, sample length: 3658 +[default0]:Skipping sample id=520916. Maximum sequence length: 2049, sample length: 2329 +[default0]:Skipping sample id=1286894. Maximum sequence length: 2049, sample length: 2406 +[default0]:Skipping sample id=746875. Maximum sequence length: 2049, sample length: 2531 +[default0]:Skipping sample id=1090323. Maximum sequence length: 2049, sample length: 3206 +[default0]:Skipping sample id=461807. Maximum sequence length: 2049, sample length: 2301 +[default0]:Skipping sample id=200261. Maximum sequence length: 2049, sample length: 2705 +[default0]:Skipping sample id=414377. Maximum sequence length: 2049, sample length: 2307 +[default0]:Skipping sample id=591160. Maximum sequence length: 2049, sample length: 3067 +[default0]:Skipping sample id=1039311. Maximum sequence length: 2049, sample length: 5059 +[default0]:Skipping sample id=326787. Maximum sequence length: 2049, sample length: 2055 +[default0]:Skipping sample id=1317293. Maximum sequence length: 2049, sample length: 2604 +[default0]:Skipping sample id=1417611. Maximum sequence length: 2049, sample length: 2118 +[default0]:Skipping sample id=191202. Maximum sequence length: 2049, sample length: 2113 +[default0]:Skipping sample id=652581. Maximum sequence length: 2049, sample length: 2753 +[default0]:Skipping sample id=166783. Maximum sequence length: 2049, sample length: 2763 +[default0]:Skipping sample id=713959. Maximum sequence length: 2049, sample length: 2819 +[default0]:Skipping sample id=49026. Maximum sequence length: 2049, sample length: 2585 +[default0]:Skipping sample id=747620. Maximum sequence length: 2049, sample length: 2094 +[default0]:Skipping sample id=210310. Maximum sequence length: 2049, sample length: 2653 +[default0]:Skipping sample id=820151. Maximum sequence length: 2049, sample length: 2078 +[default0]:Skipping sample id=1136649. Maximum sequence length: 2049, sample length: 2236 +[default0]:Skipping sample id=151380. Maximum sequence length: 2049, sample length: 2868 +[default0]:Skipping sample id=494416. Maximum sequence length: 2049, sample length: 4284 +[default0]:Skipping sample id=854643. Maximum sequence length: 2049, sample length: 2078 +[default0]:Skipping sample id=1303304. Maximum sequence length: 2049, sample length: 3797 +[default0]:Skipping sample id=1350804. Maximum sequence length: 2049, sample length: 2247 +[default0]:Skipping sample id=388811. Maximum sequence length: 2049, sample length: 2682 +[default0]:Skipping sample id=926869. Maximum sequence length: 2049, sample length: 2857 +[default0]:Skipping sample id=1459289. Maximum sequence length: 2049, sample length: 3059 +[default0]:Skipping sample id=1209500. Maximum sequence length: 2049, sample length: 2242 +[default0]:Skipping sample id=336248. Maximum sequence length: 2049, sample length: 2265 +[default0]:Skipping sample id=937056. Maximum sequence length: 2049, sample length: 3925 +[default0]:Skipping sample id=276223. Maximum sequence length: 2049, sample length: 3415 +[default0]:Skipping sample id=1426869. Maximum sequence length: 2049, sample length: 3453 +[default0]:Skipping sample id=992670. Maximum sequence length: 2049, sample length: 2078 +[default0]:Skipping sample id=1510383. Maximum sequence length: 2049, sample length: 3126 +[default0]:Skipping sample id=80319. Maximum sequence length: 2049, sample length: 2395 +[default0]:Skipping sample id=82440. Maximum sequence length: 2049, sample length: 2588 +[default0]:Skipping sample id=490752. Maximum sequence length: 2049, sample length: 2497 +[default0]:Skipping sample id=1562992. Maximum sequence length: 2049, sample length: 2577 +[default0]:Skipping sample id=880383. Maximum sequence length: 2049, sample length: 3508 +[default0]:Skipping sample id=1299256. Maximum sequence length: 2049, sample length: 2171 +[default0]:Skipping sample id=960188. Maximum sequence length: 2049, sample length: 10717 +[default0]:Skipping sample id=282850. Maximum sequence length: 2049, sample length: 2130 +[default0]:Skipping sample id=391123. Maximum sequence length: 2049, sample length: 2593 +[default0]:Skipping sample id=275198. Maximum sequence length: 2049, sample length: 3847 +[default0]:Skipping sample id=849558. Maximum sequence length: 2049, sample length: 2816 +[default0]:Skipping sample id=362092. Maximum sequence length: 2049, sample length: 2218 +[default0]:Skipping sample id=440463. Maximum sequence length: 2049, sample length: 2146 +[default0]:Skipping sample id=617208. Maximum sequence length: 2049, sample length: 3203 +[default0]:Skipping sample id=1147784. Maximum sequence length: 2049, sample length: 3450 +[default0]:Skipping sample id=659846. Maximum sequence length: 2049, sample length: 2065 +[default0]:Skipping sample id=554283. Maximum sequence length: 2049, sample length: 2151 +[default0]:Skipping sample id=771063. Maximum sequence length: 2049, sample length: 2653 +[default0]:Skipping sample id=978410. Maximum sequence length: 2049, sample length: 2355 +[default0]:Skipping sample id=160550. Maximum sequence length: 2049, sample length: 2530 +[default0]:Skipping sample id=332562. Maximum sequence length: 2049, sample length: 3391 +[default0]:Skipping sample id=540811. Maximum sequence length: 2049, sample length: 2699 +[default0]:Skipping sample id=413433. Maximum sequence length: 2049, sample length: 5165 +[default0]:Skipping sample id=784673. Maximum sequence length: 2049, sample length: 2265 +[default0]:Skipping sample id=1476435. Maximum sequence length: 2049, sample length: 4020 +[default0]:Skipping sample id=224321. Maximum sequence length: 2049, sample length: 2237 +[default0]:Skipping sample id=855572. Maximum sequence length: 2049, sample length: 2342 +[default0]:Skipping sample id=462293. Maximum sequence length: 2049, sample length: 2055 +[default0]:Skipping sample id=1175272. Maximum sequence length: 2049, sample length: 2801 +[default0]:Skipping sample id=676513. Maximum sequence length: 2049, sample length: 3143 +[default0]:Skipping sample id=900826. Maximum sequence length: 2049, sample length: 3439 +[default0]:Skipping sample id=1202322. Maximum sequence length: 2049, sample length: 2061 +[default0]:Skipping sample id=1161408. Maximum sequence length: 2049, sample length: 2512 +[default0]:Skipping sample id=22018. Maximum sequence length: 2049, sample length: 4230 +[default0]:Skipping sample id=315225. Maximum sequence length: 2049, sample length: 3104 +[default0]:Skipping sample id=362574. Maximum sequence length: 2049, sample length: 3822 +[default0]:Skipping sample id=693903. Maximum sequence length: 2049, sample length: 2285 +[default0]:Skipping sample id=461309. Maximum sequence length: 2049, sample length: 2817 +[default0]:Skipping sample id=692329. Maximum sequence length: 2049, sample length: 2939 +[default0]:Skipping sample id=170750. Maximum sequence length: 2049, sample length: 2788 +[default0]:Skipping sample id=1527236. Maximum sequence length: 2049, sample length: 2137 +[default0]:Skipping sample id=472938. Maximum sequence length: 2049, sample length: 2757 +[default0]:Skipping sample id=326095. Maximum sequence length: 2049, sample length: 2056 +[default0]:Skipping sample id=757399. Maximum sequence length: 2049, sample length: 2770 +[default0]:Skipping sample id=527097. Maximum sequence length: 2049, sample length: 2326 +[default0]:Skipping sample id=1076086. Maximum sequence length: 2049, sample length: 2579 +[default0]:Skipping sample id=900649. Maximum sequence length: 2049, sample length: 2582 +[default0]:Skipping sample id=977576. Maximum sequence length: 2049, sample length: 2305 +[default0]:Skipping sample id=1284322. Maximum sequence length: 2049, sample length: 2953 +[default0]:Skipping sample id=835880. Maximum sequence length: 2049, sample length: 3549 +[default0]:Skipping sample id=20383. Maximum sequence length: 2049, sample length: 2128 +[default0]:Skipping sample id=1148232. Maximum sequence length: 2049, sample length: 2218 +[default0]:Skipping sample id=400374. Maximum sequence length: 2049, sample length: 2196 +[default0]:Skipping sample id=634689. Maximum sequence length: 2049, sample length: 2106 +[default0]:Skipping sample id=1367707. Maximum sequence length: 2049, sample length: 4785 +[default0]:Skipping sample id=1178945. Maximum sequence length: 2049, sample length: 2223 +[default0]:Skipping sample id=277296. Maximum sequence length: 2049, sample length: 3804 +[default0]:Skipping sample id=101496. Maximum sequence length: 2049, sample length: 2214 +[default0]:Skipping sample id=1268185. Maximum sequence length: 2049, sample length: 2800 +[default0]:Skipping sample id=1205360. Maximum sequence length: 2049, sample length: 2241 +[default0]:Skipping sample id=930155. Maximum sequence length: 2049, sample length: 2406 +[default0]:Skipping sample id=631566. Maximum sequence length: 2049, sample length: 2423 +[default0]:Skipping sample id=1235134. Maximum sequence length: 2049, sample length: 4341 +[default0]:Skipping sample id=1324384. Maximum sequence length: 2049, sample length: 3837 +[default0]:Skipping sample id=299307. Maximum sequence length: 2049, sample length: 2239 +[default0]:Skipping sample id=1249441. Maximum sequence length: 2049, sample length: 2211 +[default0]:Skipping sample id=842016. Maximum sequence length: 2049, sample length: 2294 +[default0]:Skipping sample id=793573. Maximum sequence length: 2049, sample length: 2051 +[default0]:Skipping sample id=577724. Maximum sequence length: 2049, sample length: 2959 +[default0]:Skipping sample id=267063. Maximum sequence length: 2049, sample length: 3491 +[default0]:Skipping sample id=1132799. Maximum sequence length: 2049, sample length: 3034 +[default0]:Skipping sample id=213350. Maximum sequence length: 2049, sample length: 2675 +[default0]:Skipping sample id=32435. Maximum sequence length: 2049, sample length: 2388 +[default0]:Skipping sample id=989912. Maximum sequence length: 2049, sample length: 2167 +[default0]:Skipping sample id=640934. Maximum sequence length: 2049, sample length: 2296 +[default0]:Skipping sample id=922442. Maximum sequence length: 2049, sample length: 4341 +[default0]:Skipping sample id=989528. Maximum sequence length: 2049, sample length: 3616 +[default0]:Skipping sample id=396946. Maximum sequence length: 2049, sample length: 2776 +[default0]:Skipping sample id=211512. Maximum sequence length: 2049, sample length: 3357 +[default0]:Skipping sample id=1074109. Maximum sequence length: 2049, sample length: 2335 +[default0]:Skipping sample id=880394. Maximum sequence length: 2049, sample length: 3074 +[default0]:Skipping sample id=238223. Maximum sequence length: 2049, sample length: 3953 +[default0]:Skipping sample id=691656. Maximum sequence length: 2049, sample length: 2230 +[default0]:Skipping sample id=123390. Maximum sequence length: 2049, sample length: 3962 +[default0]:Skipping sample id=1245699. Maximum sequence length: 2049, sample length: 2557 +[default0]:Skipping sample id=1098760. Maximum sequence length: 2049, sample length: 2179 +[default0]:Skipping sample id=1180861. Maximum sequence length: 2049, sample length: 5191 +[default0]:Skipping sample id=781047. Maximum sequence length: 2049, sample length: 2574 +[default0]:Skipping sample id=1251783. Maximum sequence length: 2049, sample length: 2210 +[default0]:Skipping sample id=495005. Maximum sequence length: 2049, sample length: 3111 +[default0]:Skipping sample id=1467275. Maximum sequence length: 2049, sample length: 4958 +[default0]:Skipping sample id=1176842. Maximum sequence length: 2049, sample length: 2337 +[default0]:Skipping sample id=1505148. Maximum sequence length: 2049, sample length: 2078 +[default0]:Skipping sample id=908992. Maximum sequence length: 2049, sample length: 2829 +[default0]:Skipping sample id=771811. Maximum sequence length: 2049, sample length: 3647 +[default0]:Skipping sample id=1377167. Maximum sequence length: 2049, sample length: 2508 +[default0]:Skipping sample id=770413. Maximum sequence length: 2049, sample length: 2834 +[default0]:Skipping sample id=1305423. Maximum sequence length: 2049, sample length: 2087 +[default0]:Skipping sample id=369444. Maximum sequence length: 2049, sample length: 2942 +[default0]:Skipping sample id=1247023. Maximum sequence length: 2049, sample length: 2382 +[default0]:Skipping sample id=1547366. Maximum sequence length: 2049, sample length: 2626 +[default0]:Skipping sample id=467779. Maximum sequence length: 2049, sample length: 2163 +[default0]:Skipping sample id=499285. Maximum sequence length: 2049, sample length: 2323 +[default0]:Skipping sample id=62664. Maximum sequence length: 2049, sample length: 3970 +[default0]:Skipping sample id=100198. Maximum sequence length: 2049, sample length: 2072 +[default0]:Skipping sample id=670372. Maximum sequence length: 2049, sample length: 2195 +[default0]:Skipping sample id=1178814. Maximum sequence length: 2049, sample length: 2370 +[default0]:Skipping sample id=1269775. Maximum sequence length: 2049, sample length: 2635 +[default0]:Skipping sample id=1518568. Maximum sequence length: 2049, sample length: 3965 +[default0]:Skipping sample id=117047. Maximum sequence length: 2049, sample length: 2122 +[default0]:Skipping sample id=1530580. Maximum sequence length: 2049, sample length: 4343 +[default0]:Skipping sample id=340753. Maximum sequence length: 2049, sample length: 3004 +[default0]:Skipping sample id=709315. Maximum sequence length: 2049, sample length: 3403 +[default0]:Skipping sample id=312674. Maximum sequence length: 2049, sample length: 3418 +[default0]:Skipping sample id=217754. Maximum sequence length: 2049, sample length: 4454 +[default0]:Skipping sample id=929105. Maximum sequence length: 2049, sample length: 2719 +[default0]:Skipping sample id=321959. Maximum sequence length: 2049, sample length: 2681 +[default0]:Skipping sample id=1363326. Maximum sequence length: 2049, sample length: 3522 +[default0]:Skipping sample id=802014. Maximum sequence length: 2049, sample length: 4952 +[default0]:Skipping sample id=1082005. Maximum sequence length: 2049, sample length: 2153 +[default0]:Skipping sample id=432749. Maximum sequence length: 2049, sample length: 2254 +[default0]:Skipping sample id=1182273. Maximum sequence length: 2049, sample length: 2342 +[default0]:Skipping sample id=978252. Maximum sequence length: 2049, sample length: 4516 +[default0]:Skipping sample id=1301667. Maximum sequence length: 2049, sample length: 3697 +[default0]:Skipping sample id=715445. Maximum sequence length: 2049, sample length: 2383 +[default0]:Skipping sample id=1353064. Maximum sequence length: 2049, sample length: 2540 +[default0]:Skipping sample id=102064. Maximum sequence length: 2049, sample length: 3201 +[default0]:Skipping sample id=709347. Maximum sequence length: 2049, sample length: 2694 +[default0]:Skipping sample id=496411. Maximum sequence length: 2049, sample length: 3181 +[default0]:Skipping sample id=1221340. Maximum sequence length: 2049, sample length: 3528 +[default0]:Skipping sample id=512693. Maximum sequence length: 2049, sample length: 2731 +[default0]:Skipping sample id=1050680. Maximum sequence length: 2049, sample length: 4819 +[default0]:Skipping sample id=489100. Maximum sequence length: 2049, sample length: 3652 +[default0]:Skipping sample id=1361579. Maximum sequence length: 2049, sample length: 2728 +[default0]:Skipping sample id=1012055. Maximum sequence length: 2049, sample length: 4639 +[default0]:Skipping sample id=1194707. Maximum sequence length: 2049, sample length: 2232 +[default0]:Skipping sample id=966803. Maximum sequence length: 2049, sample length: 3338 +[default0]:Skipping sample id=158196. Maximum sequence length: 2049, sample length: 2504 +[default0]:Skipping sample id=351865. Maximum sequence length: 2049, sample length: 2411 +[default0]:Skipping sample id=1219809. Maximum sequence length: 2049, sample length: 2642 +[default0]:Skipping sample id=233342. Maximum sequence length: 2049, sample length: 2795 +[default0]:Skipping sample id=725082. Maximum sequence length: 2049, sample length: 3211 +[default0]:Skipping sample id=1516847. Maximum sequence length: 2049, sample length: 2205 +[default0]:Skipping sample id=1272901. Maximum sequence length: 2049, sample length: 2848 +[default0]:Skipping sample id=1472530. Maximum sequence length: 2049, sample length: 3006 +[default0]:Skipping sample id=763667. Maximum sequence length: 2049, sample length: 2228 +[default0]:Skipping sample id=394493. Maximum sequence length: 2049, sample length: 2806 +[default0]:Skipping sample id=700392. Maximum sequence length: 2049, sample length: 3449 +[default0]:Skipping sample id=456377. Maximum sequence length: 2049, sample length: 3189 +[default0]:Skipping sample id=1026291. Maximum sequence length: 2049, sample length: 2732 +[default0]:Skipping sample id=990488. Maximum sequence length: 2049, sample length: 4483 +[default0]:Skipping sample id=304548. Maximum sequence length: 2049, sample length: 2149 +[default0]:Skipping sample id=1505593. Maximum sequence length: 2049, sample length: 2232 +[default0]:Skipping sample id=205937. Maximum sequence length: 2049, sample length: 2917 +[default0]:Skipping sample id=762677. Maximum sequence length: 2049, sample length: 2707 +[default0]:Skipping sample id=1475658. Maximum sequence length: 2049, sample length: 2205 +[default0]:Skipping sample id=249108. Maximum sequence length: 2049, sample length: 2973 +[default0]:Skipping sample id=740232. Maximum sequence length: 2049, sample length: 6108 +[default0]:Skipping sample id=1505968. Maximum sequence length: 2049, sample length: 2295 +[default0]:Skipping sample id=935935. Maximum sequence length: 2049, sample length: 2841 +[default0]:Skipping sample id=949334. Maximum sequence length: 2049, sample length: 3098 +[default0]:Skipping sample id=20986. Maximum sequence length: 2049, sample length: 3132 +[default0]:Skipping sample id=148078. Maximum sequence length: 2049, sample length: 3555 +[default0]:Skipping sample id=800760. Maximum sequence length: 2049, sample length: 3070 +[default0]:Skipping sample id=1495985. Maximum sequence length: 2049, sample length: 2613 +[default0]:Skipping sample id=594352. Maximum sequence length: 2049, sample length: 2184 +[default0]:Skipping sample id=1365506. Maximum sequence length: 2049, sample length: 2156 +[default0]:Skipping sample id=811998. Maximum sequence length: 2049, sample length: 3679 +[default0]:Skipping sample id=667336. Maximum sequence length: 2049, sample length: 2705 +[default0]:Skipping sample id=417874. Maximum sequence length: 2049, sample length: 2575 +[default0]:Skipping sample id=133885. Maximum sequence length: 2049, sample length: 3305 +[default0]:Skipping sample id=788404. Maximum sequence length: 2049, sample length: 2098 +[default0]:Skipping sample id=396527. Maximum sequence length: 2049, sample length: 2449 +[default0]:Skipping sample id=1432785. Maximum sequence length: 2049, sample length: 2142 +[default0]:Skipping sample id=147017. Maximum sequence length: 2049, sample length: 2345 +[default0]:Skipping sample id=1486618. Maximum sequence length: 2049, sample length: 2798 +[default0]:Skipping sample id=69480. Maximum sequence length: 2049, sample length: 2920 +[default0]:Skipping sample id=492674. Maximum sequence length: 2049, sample length: 3878 +[default0]:Skipping sample id=664858. Maximum sequence length: 2049, sample length: 4509 +[default0]:Skipping sample id=1300274. Maximum sequence length: 2049, sample length: 2179 +[default0]:Skipping sample id=920113. Maximum sequence length: 2049, sample length: 2175 +[default0]:Skipping sample id=1457429. Maximum sequence length: 2049, sample length: 2119 +[default0]:Skipping sample id=58679. Maximum sequence length: 2049, sample length: 2335 +[default0]:Skipping sample id=505098. Maximum sequence length: 2049, sample length: 4208 +[default0]:Skipping sample id=574138. Maximum sequence length: 2049, sample length: 3090 +[default0]:Skipping sample id=1100994. Maximum sequence length: 2049, sample length: 2170 +[default0]:Skipping sample id=1228745. Maximum sequence length: 2049, sample length: 2564 +[default0]:Skipping sample id=1440137. Maximum sequence length: 2049, sample length: 3763 +[default0]:Skipping sample id=1459925. Maximum sequence length: 2049, sample length: 2522 +[default0]:Skipping sample id=229589. Maximum sequence length: 2049, sample length: 2614 +[default0]:Skipping sample id=681929. Maximum sequence length: 2049, sample length: 4303 +[default0]:Skipping sample id=1382228. Maximum sequence length: 2049, sample length: 2183 +[default0]:Skipping sample id=1435195. Maximum sequence length: 2049, sample length: 3377 +[default0]:Skipping sample id=775906. Maximum sequence length: 2049, sample length: 4450 +[default0]:Skipping sample id=1514622. Maximum sequence length: 2049, sample length: 2312 +[default0]:Skipping sample id=13248. Maximum sequence length: 2049, sample length: 2056 +[default0]:Skipping sample id=940123. Maximum sequence length: 2049, sample length: 2496 +[default0]:Skipping sample id=1475495. Maximum sequence length: 2049, sample length: 2709 +[default0]:Skipping sample id=1044543. Maximum sequence length: 2049, sample length: 2079 +[default0]:Skipping sample id=980723. Maximum sequence length: 2049, sample length: 2660 +[default0]:Skipping sample id=1435388. Maximum sequence length: 2049, sample length: 2467 +[default0]:Skipping sample id=338140. Maximum sequence length: 2049, sample length: 2065 +[default0]:Skipping sample id=1476317. Maximum sequence length: 2049, sample length: 2448 +[default0]:Skipping sample id=695347. Maximum sequence length: 2049, sample length: 3279 +[default0]:Skipping sample id=384415. Maximum sequence length: 2049, sample length: 3129 +[default0]:Skipping sample id=1394897. Maximum sequence length: 2049, sample length: 2165 +[default0]:Skipping sample id=438220. Maximum sequence length: 2049, sample length: 2113 +[default0]:Skipping sample id=1088306. Maximum sequence length: 2049, sample length: 2253 +[default0]:Skipping sample id=464166. Maximum sequence length: 2049, sample length: 2796 +[default0]:Skipping sample id=708121. Maximum sequence length: 2049, sample length: 3260 +[default0]:Skipping sample id=853994. Maximum sequence length: 2049, sample length: 2703 +[default0]:Skipping sample id=192633. Maximum sequence length: 2049, sample length: 2759 +[default0]:Skipping sample id=1025495. Maximum sequence length: 2049, sample length: 2308 +[default0]:Skipping sample id=936416. Maximum sequence length: 2049, sample length: 2620 +[default0]:Skipping sample id=101727. Maximum sequence length: 2049, sample length: 2087 +[default0]:Skipping sample id=406391. Maximum sequence length: 2049, sample length: 2384 +[default0]:Skipping sample id=50233. Maximum sequence length: 2049, sample length: 2833 +[default0]:Skipping sample id=1520179. Maximum sequence length: 2049, sample length: 2451 +[default0]:Skipping sample id=1395037. Maximum sequence length: 2049, sample length: 2662 +[default0]:Skipping sample id=1271541. Maximum sequence length: 2049, sample length: 2407 +[default0]:Skipping sample id=1035441. Maximum sequence length: 2049, sample length: 2688 +[default0]:Skipping sample id=76154. Maximum sequence length: 2049, sample length: 3466 +[default0]:Skipping sample id=847585. Maximum sequence length: 2049, sample length: 2975 +[default0]:Skipping sample id=1161084. Maximum sequence length: 2049, sample length: 2140 +[default0]:Skipping sample id=74966. Maximum sequence length: 2049, sample length: 2247 +[default0]:Skipping sample id=1495770. Maximum sequence length: 2049, sample length: 3025 +[default0]:Skipping sample id=1172403. Maximum sequence length: 2049, sample length: 3312 +[default0]:Skipping sample id=1476042. Maximum sequence length: 2049, sample length: 2237 +[default0]:Skipping sample id=330030. Maximum sequence length: 2049, sample length: 4641 +[default0]:Skipping sample id=1555279. Maximum sequence length: 2049, sample length: 2724 +[default0]:Skipping sample id=1569117. Maximum sequence length: 2049, sample length: 3194 +[default0]:Skipping sample id=445693. Maximum sequence length: 2049, sample length: 2184 +[default0]:Skipping sample id=1267623. Maximum sequence length: 2049, sample length: 2965 +[default0]:Skipping sample id=1130743. Maximum sequence length: 2049, sample length: 2104 +[default0]:Skipping sample id=1077925. Maximum sequence length: 2049, sample length: 2212 +[default0]:Skipping sample id=1542074. Maximum sequence length: 2049, sample length: 4644 +[default0]:Skipping sample id=691112. Maximum sequence length: 2049, sample length: 2929 +[default0]:Skipping sample id=342213. Maximum sequence length: 2049, sample length: 5555 +[default0]:Skipping sample id=1357291. Maximum sequence length: 2049, sample length: 2975 +[default0]:Skipping sample id=1210180. Maximum sequence length: 2049, sample length: 2655 +[default0]:Skipping sample id=851544. Maximum sequence length: 2049, sample length: 3161 +[default0]:Skipping sample id=686537. Maximum sequence length: 2049, sample length: 2067 +[default0]:Skipping sample id=825840. Maximum sequence length: 2049, sample length: 3295 +[default0]:Skipping sample id=1206996. Maximum sequence length: 2049, sample length: 4696 +[default0]:Skipping sample id=544527. Maximum sequence length: 2049, sample length: 2181 +[default0]:Skipping sample id=1267258. Maximum sequence length: 2049, sample length: 2312 +[default0]:Skipping sample id=943295. Maximum sequence length: 2049, sample length: 3098 +[default0]:Skipping sample id=1546479. Maximum sequence length: 2049, sample length: 3353 +[default0]:Skipping sample id=1552204. Maximum sequence length: 2049, sample length: 3377 +[default0]:Skipping sample id=1258831. Maximum sequence length: 2049, sample length: 2282 +[default0]:Skipping sample id=814894. Maximum sequence length: 2049, sample length: 2900 +[default0]:Skipping sample id=494147. Maximum sequence length: 2049, sample length: 3228 +[default0]:Skipping sample id=1358996. Maximum sequence length: 2049, sample length: 3003 +[default0]:Skipping sample id=503448. Maximum sequence length: 2049, sample length: 2539 +[default0]:Skipping sample id=989745. Maximum sequence length: 2049, sample length: 2328 +[default0]:Skipping sample id=1212885. Maximum sequence length: 2049, sample length: 3537 +[default0]:Skipping sample id=1225401. Maximum sequence length: 2049, sample length: 2712 +[default0]:Skipping sample id=1405179. Maximum sequence length: 2049, sample length: 3126 +[default0]:Skipping sample id=1451333. Maximum sequence length: 2049, sample length: 2528 +[default0]:Skipping sample id=384120. Maximum sequence length: 2049, sample length: 2286 +[default0]:Skipping sample id=547870. Maximum sequence length: 2049, sample length: 2559 +[default0]:Skipping sample id=870394. Maximum sequence length: 2049, sample length: 2852 +[default0]:Skipping sample id=1269226. Maximum sequence length: 2049, sample length: 5295 +[default0]:Skipping sample id=71288. Maximum sequence length: 2049, sample length: 2417 +[default0]:Skipping sample id=1496677. Maximum sequence length: 2049, sample length: 2809 +[default0]:Skipping sample id=1028691. Maximum sequence length: 2049, sample length: 2127 +[default0]:Skipping sample id=1318536. Maximum sequence length: 2049, sample length: 4907 +[default0]:Skipping sample id=893630. Maximum sequence length: 2049, sample length: 2272 +[default0]:Skipping sample id=1245363. Maximum sequence length: 2049, sample length: 2651 +[default0]:Skipping sample id=725136. Maximum sequence length: 2049, sample length: 2106 +[default0]:Skipping sample id=463355. Maximum sequence length: 2049, sample length: 2283 +[default0]:Skipping sample id=822160. Maximum sequence length: 2049, sample length: 2728 +[default0]:Skipping sample id=960291. Maximum sequence length: 2049, sample length: 2070 +[default0]:Skipping sample id=1366922. Maximum sequence length: 2049, sample length: 2639 +[default0]:Skipping sample id=1036719. Maximum sequence length: 2049, sample length: 4163 +[default0]:Skipping sample id=1251171. Maximum sequence length: 2049, sample length: 4003 +[default0]:Skipping sample id=1090028. Maximum sequence length: 2049, sample length: 2331 +[default0]:Skipping sample id=92726. Maximum sequence length: 2049, sample length: 2482 +[default0]:Skipping sample id=350075. Maximum sequence length: 2049, sample length: 3588 +[default0]:Skipping sample id=420161. Maximum sequence length: 2049, sample length: 2122 +[default0]:Skipping sample id=809849. Maximum sequence length: 2049, sample length: 2280 +[default0]:Skipping sample id=450050. Maximum sequence length: 2049, sample length: 3379 +[default0]:Skipping sample id=74896. Maximum sequence length: 2049, sample length: 3444 +[default0]:Skipping sample id=697412. Maximum sequence length: 2049, sample length: 2961 +[default0]:Skipping sample id=783119. Maximum sequence length: 2049, sample length: 4230 +[default0]:Skipping sample id=762642. Maximum sequence length: 2049, sample length: 2751 +[default0]:Skipping sample id=602860. Maximum sequence length: 2049, sample length: 2526 +[default0]:Skipping sample id=916807. Maximum sequence length: 2049, sample length: 2077 +[default0]:Skipping sample id=681257. Maximum sequence length: 2049, sample length: 4150 +[default0]:Skipping sample id=1543964. Maximum sequence length: 2049, sample length: 2750 +[default0]:Skipping sample id=784161. Maximum sequence length: 2049, sample length: 3210 +[default0]:Skipping sample id=458928. Maximum sequence length: 2049, sample length: 2865 +[default0]:Skipping sample id=1438727. Maximum sequence length: 2049, sample length: 2071 +[default0]:Skipping sample id=720952. Maximum sequence length: 2049, sample length: 2813 +[default0]:Skipping sample id=264727. Maximum sequence length: 2049, sample length: 5236 +[default0]:Skipping sample id=765818. Maximum sequence length: 2049, sample length: 2492 +[default0]:Skipping sample id=603468. Maximum sequence length: 2049, sample length: 2776 +[default0]:Skipping sample id=76049. Maximum sequence length: 2049, sample length: 4129 +[default0]:Skipping sample id=620050. Maximum sequence length: 2049, sample length: 2365 +[default0]:Skipping sample id=1079298. Maximum sequence length: 2049, sample length: 2069 +[default0]:Skipping sample id=362270. Maximum sequence length: 2049, sample length: 2124 +[default0]:Skipping sample id=640674. Maximum sequence length: 2049, sample length: 2896 +[default0]:Skipping sample id=225865. Maximum sequence length: 2049, sample length: 4518 +[default0]:Skipping sample id=460267. Maximum sequence length: 2049, sample length: 2893 +[default0]:Skipping sample id=609303. Maximum sequence length: 2049, sample length: 3903 +[default0]:Skipping sample id=998195. Maximum sequence length: 2049, sample length: 4660 +[default0]:Skipping sample id=921785. Maximum sequence length: 2049, sample length: 2835 +[default0]:Skipping sample id=957341. Maximum sequence length: 2049, sample length: 3445 +[default0]:Skipping sample id=559138. Maximum sequence length: 2049, sample length: 2782 +[default0]:Skipping sample id=436281. Maximum sequence length: 2049, sample length: 3300 +[default0]:Skipping sample id=1486652. Maximum sequence length: 2049, sample length: 2384 +[default0]:Skipping sample id=627235. Maximum sequence length: 2049, sample length: 2991 +[default0]:Skipping sample id=569818. Maximum sequence length: 2049, sample length: 3017 +[default0]:Skipping sample id=1522096. Maximum sequence length: 2049, sample length: 4124 +[default0]:Skipping sample id=60690. Maximum sequence length: 2049, sample length: 2533 +[default0]:Skipping sample id=150690. Maximum sequence length: 2049, sample length: 2999 +[default0]:Skipping sample id=713941. Maximum sequence length: 2049, sample length: 2943 +[default0]:Skipping sample id=1141294. Maximum sequence length: 2049, sample length: 3475 +[default0]:Skipping sample id=260019. Maximum sequence length: 2049, sample length: 2138 +[default0]:Skipping sample id=941325. Maximum sequence length: 2049, sample length: 2290 +[default0]:Skipping sample id=1240482. Maximum sequence length: 2049, sample length: 2964 +[default0]:Skipping sample id=396616. Maximum sequence length: 2049, sample length: 3661 +[default0]:Skipping sample id=520417. Maximum sequence length: 2049, sample length: 2402 +[default0]:Skipping sample id=1564478. Maximum sequence length: 2049, sample length: 2177 +[default0]:Skipping sample id=547535. Maximum sequence length: 2049, sample length: 2238 +[default0]:Skipping sample id=1375749. Maximum sequence length: 2049, sample length: 4635 +[default0]:Skipping sample id=1339666. Maximum sequence length: 2049, sample length: 2164 +[default0]:Skipping sample id=418987. Maximum sequence length: 2049, sample length: 2226 +[default0]:Skipping sample id=1288096. Maximum sequence length: 2049, sample length: 3235 +[default0]:Skipping sample id=1170835. Maximum sequence length: 2049, sample length: 2266 +[default0]:Skipping sample id=547846. Maximum sequence length: 2049, sample length: 2376 +[default0]:Skipping sample id=1473836. Maximum sequence length: 2049, sample length: 2708 +[default0]:Skipping sample id=503928. Maximum sequence length: 2049, sample length: 3699 +[default0]:Skipping sample id=460532. Maximum sequence length: 2049, sample length: 2896 +[default0]:Skipping sample id=849737. Maximum sequence length: 2049, sample length: 4356 +[default0]:Skipping sample id=809606. Maximum sequence length: 2049, sample length: 2107 +[default0]:Skipping sample id=183844. Maximum sequence length: 2049, sample length: 2820 +[default0]:Skipping sample id=1432190. Maximum sequence length: 2049, sample length: 2709 +[default0]:Skipping sample id=1360741. Maximum sequence length: 2049, sample length: 2124 +[default0]:Skipping sample id=939513. Maximum sequence length: 2049, sample length: 2155 +[default0]:Skipping sample id=905951. Maximum sequence length: 2049, sample length: 2692 +[default0]:Skipping sample id=434345. Maximum sequence length: 2049, sample length: 3931 +[default0]:Skipping sample id=1157433. Maximum sequence length: 2049, sample length: 2457 +[default0]:Skipping sample id=1066456. Maximum sequence length: 2049, sample length: 3097 +[default0]:Skipping sample id=855408. Maximum sequence length: 2049, sample length: 2729 +[default0]:Skipping sample id=1310582. Maximum sequence length: 2049, sample length: 4653 +[default0]:Skipping sample id=799701. Maximum sequence length: 2049, sample length: 3920 +[default0]:Skipping sample id=795621. Maximum sequence length: 2049, sample length: 2548 +[default0]:Skipping sample id=1351206. Maximum sequence length: 2049, sample length: 2223 +[default0]:Skipping sample id=398438. Maximum sequence length: 2049, sample length: 3279 +[default0]:Skipping sample id=186409. Maximum sequence length: 2049, sample length: 2612 +[default0]:Skipping sample id=873507. Maximum sequence length: 2049, sample length: 3408 +[default0]:Skipping sample id=91736. Maximum sequence length: 2049, sample length: 2623 +[default0]:Skipping sample id=160540. Maximum sequence length: 2049, sample length: 2296 +[default0]:Skipping sample id=768403. Maximum sequence length: 2049, sample length: 3207 +[default0]:Skipping sample id=482247. Maximum sequence length: 2049, sample length: 3609 +[default0]:Skipping sample id=1388568. Maximum sequence length: 2049, sample length: 5074 +[default0]:Skipping sample id=89113. Maximum sequence length: 2049, sample length: 2089 +[default0]:Skipping sample id=174570. Maximum sequence length: 2049, sample length: 2485 +[default0]:Skipping sample id=293541. Maximum sequence length: 2049, sample length: 2371 +[default0]:Skipping sample id=324789. Maximum sequence length: 2049, sample length: 3554 +[default0]:Skipping sample id=1066900. Maximum sequence length: 2049, sample length: 2419 +[default0]:Skipping sample id=1294391. Maximum sequence length: 2049, sample length: 6024 +[default0]:Skipping sample id=1501010. Maximum sequence length: 2049, sample length: 2706 +[default0]:Skipping sample id=266373. Maximum sequence length: 2049, sample length: 2521 +[default0]:Skipping sample id=1532880. Maximum sequence length: 2049, sample length: 3264 +[default0]:Skipping sample id=105779. Maximum sequence length: 2049, sample length: 3612 +[default0]:Skipping sample id=1305657. Maximum sequence length: 2049, sample length: 2313 +[default0]:Skipping sample id=1384948. Maximum sequence length: 2049, sample length: 2954 +[default0]:Skipping sample id=1313992. Maximum sequence length: 2049, sample length: 3341 +[default0]:Skipping sample id=1239755. Maximum sequence length: 2049, sample length: 2565 +[default0]:Skipping sample id=505097. Maximum sequence length: 2049, sample length: 2903 +[default0]:Skipping sample id=167683. Maximum sequence length: 2049, sample length: 2347 +[default0]:Skipping sample id=1108063. Maximum sequence length: 2049, sample length: 3036 +[default0]:Skipping sample id=1349570. Maximum sequence length: 2049, sample length: 3589 +[default0]:Skipping sample id=1259718. Maximum sequence length: 2049, sample length: 2427 +[default0]:Skipping sample id=891871. Maximum sequence length: 2049, sample length: 2654 +[default0]:Skipping sample id=1038487. Maximum sequence length: 2049, sample length: 2076 +[default0]:Skipping sample id=51492. Maximum sequence length: 2049, sample length: 3637 +[default0]:Skipping sample id=459181. Maximum sequence length: 2049, sample length: 2399 +[default0]:Skipping sample id=856227. Maximum sequence length: 2049, sample length: 2828 +[default0]:Skipping sample id=476442. Maximum sequence length: 2049, sample length: 5731 +[default0]:Skipping sample id=3913. Maximum sequence length: 2049, sample length: 6946 +[default0]:Skipping sample id=1269042. Maximum sequence length: 2049, sample length: 3818 +[default0]:Skipping sample id=1054948. Maximum sequence length: 2049, sample length: 4400 +[default0]:Skipping sample id=20513. Maximum sequence length: 2049, sample length: 2099 +[default0]:Skipping sample id=565964. Maximum sequence length: 2049, sample length: 2848 +[default0]:Skipping sample id=1213959. Maximum sequence length: 2049, sample length: 2494 +[default0]:Skipping sample id=16573. Maximum sequence length: 2049, sample length: 4017 +[default0]:Skipping sample id=112330. Maximum sequence length: 2049, sample length: 4928 +[default0]:Skipping sample id=430867. Maximum sequence length: 2049, sample length: 2368 +[default0]:Skipping sample id=1239061. Maximum sequence length: 2049, sample length: 2261 +[default0]:Skipping sample id=354462. Maximum sequence length: 2049, sample length: 2147 +[default0]:Skipping sample id=1228792. Maximum sequence length: 2049, sample length: 2305 +[default0]:Skipping sample id=172042. Maximum sequence length: 2049, sample length: 2182 +[default0]:Skipping sample id=1024422. Maximum sequence length: 2049, sample length: 2662 +[default0]:Skipping sample id=618809. Maximum sequence length: 2049, sample length: 7248 +[default0]:Skipping sample id=988279. Maximum sequence length: 2049, sample length: 2054 +[default0]:Skipping sample id=952358. Maximum sequence length: 2049, sample length: 4215 +[default0]:Skipping sample id=1009164. Maximum sequence length: 2049, sample length: 3241 +[default0]:Skipping sample id=9490. Maximum sequence length: 2049, sample length: 2649 +[default0]:Skipping sample id=1099459. Maximum sequence length: 2049, sample length: 3473 +[default0]:Skipping sample id=145712. Maximum sequence length: 2049, sample length: 2330 +[default0]:Skipping sample id=579147. Maximum sequence length: 2049, sample length: 2564 +[default0]:Skipping sample id=335070. Maximum sequence length: 2049, sample length: 3254 +[default0]:Skipping sample id=1020481. Maximum sequence length: 2049, sample length: 3130 +[default0]:Skipping sample id=23787. Maximum sequence length: 2049, sample length: 2949 +[default0]:Skipping sample id=967654. Maximum sequence length: 2049, sample length: 3219 +[default0]:Skipping sample id=733616. Maximum sequence length: 2049, sample length: 3023 +[default0]:Skipping sample id=444887. Maximum sequence length: 2049, sample length: 3435 +[default0]:Skipping sample id=1301915. Maximum sequence length: 2049, sample length: 2312 +[default0]:Skipping sample id=1440424. Maximum sequence length: 2049, sample length: 2768 +[default0]:Skipping sample id=542819. Maximum sequence length: 2049, sample length: 3658 +[default0]:Skipping sample id=118938. Maximum sequence length: 2049, sample length: 2142 +[default0]:Skipping sample id=1466212. Maximum sequence length: 2049, sample length: 2234 +[default0]:Skipping sample id=475391. Maximum sequence length: 2049, sample length: 2740 +[default0]:Skipping sample id=1499950. Maximum sequence length: 2049, sample length: 3058 +[default0]:Skipping sample id=355266. Maximum sequence length: 2049, sample length: 3380 +[default0]:Skipping sample id=1282350. Maximum sequence length: 2049, sample length: 4271 +[default0]:Skipping sample id=332722. Maximum sequence length: 2049, sample length: 2244 +[default0]:Skipping sample id=1508475. Maximum sequence length: 2049, sample length: 3024 +[default0]:Skipping sample id=130322. Maximum sequence length: 2049, sample length: 4013 +[default0]:Skipping sample id=286442. Maximum sequence length: 2049, sample length: 3742 +[default0]:Skipping sample id=85562. Maximum sequence length: 2049, sample length: 3812 +[default0]:Skipping sample id=570825. Maximum sequence length: 2049, sample length: 2286 +[default0]:Skipping sample id=1122849. Maximum sequence length: 2049, sample length: 2417 +[default0]:Skipping sample id=815767. Maximum sequence length: 2049, sample length: 3004 +[default0]:Skipping sample id=1334528. Maximum sequence length: 2049, sample length: 2120 +[default0]:Skipping sample id=108757. Maximum sequence length: 2049, sample length: 3372 +[default0]:Skipping sample id=534082. Maximum sequence length: 2049, sample length: 2065 +[default0]:Skipping sample id=1140003. Maximum sequence length: 2049, sample length: 2933 +[default0]:Skipping sample id=72583. Maximum sequence length: 2049, sample length: 2409 +[default0]:Skipping sample id=174222. Maximum sequence length: 2049, sample length: 3951 +[default0]:Skipping sample id=1516818. Maximum sequence length: 2049, sample length: 2405 +[default0]:Skipping sample id=1161088. Maximum sequence length: 2049, sample length: 3703 +[default0]:Skipping sample id=540478. Maximum sequence length: 2049, sample length: 3977 +[default0]:Skipping sample id=17358. Maximum sequence length: 2049, sample length: 3199 +[default0]:Skipping sample id=368575. Maximum sequence length: 2049, sample length: 3663 +[default0]:Skipping sample id=1099843. Maximum sequence length: 2049, sample length: 2658 +[default0]:Skipping sample id=525044. Maximum sequence length: 2049, sample length: 3711 +[default0]:Skipping sample id=1111737. Maximum sequence length: 2049, sample length: 2762 +[default0]:Skipping sample id=210701. Maximum sequence length: 2049, sample length: 4313 +[default0]:Skipping sample id=113733. Maximum sequence length: 2049, sample length: 5763 +[default0]:Skipping sample id=419045. Maximum sequence length: 2049, sample length: 3222 +[default0]:Skipping sample id=1384318. Maximum sequence length: 2049, sample length: 2184 +[default0]:Skipping sample id=309486. Maximum sequence length: 2049, sample length: 2811 +[default0]:Skipping sample id=739818. Maximum sequence length: 2049, sample length: 3141 +[default0]:Skipping sample id=1484625. Maximum sequence length: 2049, sample length: 3060 +[default0]:Skipping sample id=1195464. Maximum sequence length: 2049, sample length: 2789 +[default0]:Skipping sample id=607333. Maximum sequence length: 2049, sample length: 2231 +[default0]:Skipping sample id=877274. Maximum sequence length: 2049, sample length: 2206 +[default0]:Skipping sample id=597598. Maximum sequence length: 2049, sample length: 2196 +[default0]:Skipping sample id=343439. Maximum sequence length: 2049, sample length: 2091 +[default0]:Skipping sample id=121585. Maximum sequence length: 2049, sample length: 2234 +[default0]:Skipping sample id=1153051. Maximum sequence length: 2049, sample length: 2228 +[default0]:Skipping sample id=654201. Maximum sequence length: 2049, sample length: 2467 +[default0]:Skipping sample id=1534725. Maximum sequence length: 2049, sample length: 2643 +[default0]:Skipping sample id=1530024. Maximum sequence length: 2049, sample length: 2605 +[default0]:Skipping sample id=979233. Maximum sequence length: 2049, sample length: 3150 +[default0]:Skipping sample id=1524296. Maximum sequence length: 2049, sample length: 2262 +[default0]:Skipping sample id=1378106. Maximum sequence length: 2049, sample length: 4198 +[default0]:Skipping sample id=644512. Maximum sequence length: 2049, sample length: 2187 +[default0]:Skipping sample id=747798. Maximum sequence length: 2049, sample length: 2663 +[default0]:Skipping sample id=709633. Maximum sequence length: 2049, sample length: 2517 +[default0]:Skipping sample id=925293. Maximum sequence length: 2049, sample length: 2628 +[default0]:Skipping sample id=599321. Maximum sequence length: 2049, sample length: 2636 +[default0]:Skipping sample id=1435646. Maximum sequence length: 2049, sample length: 2516 +[default0]:Skipping sample id=1336329. Maximum sequence length: 2049, sample length: 3372 +[default0]:Skipping sample id=908793. Maximum sequence length: 2049, sample length: 3460 +[default0]:Skipping sample id=1351156. Maximum sequence length: 2049, sample length: 2589 +[default0]:Skipping sample id=798886. Maximum sequence length: 2049, sample length: 3106 +[default0]:Skipping sample id=252872. Maximum sequence length: 2049, sample length: 3285 +[default0]:Skipping sample id=180602. Maximum sequence length: 2049, sample length: 2055 +[default0]:Skipping sample id=1367357. Maximum sequence length: 2049, sample length: 2418 +[default0]:Skipping sample id=546226. Maximum sequence length: 2049, sample length: 2518 +[default0]:Skipping sample id=986115. Maximum sequence length: 2049, sample length: 3110 +[default0]:Skipping sample id=1181211. Maximum sequence length: 2049, sample length: 2140 +[default0]:Skipping sample id=123511. Maximum sequence length: 2049, sample length: 5557 +[default0]:Skipping sample id=38015. Maximum sequence length: 2049, sample length: 2683 +[default0]:Skipping sample id=1070916. Maximum sequence length: 2049, sample length: 2105 +[default0]:Skipping sample id=131240. Maximum sequence length: 2049, sample length: 3613 +[default0]:Skipping sample id=1553114. Maximum sequence length: 2049, sample length: 2590 +[default0]:Skipping sample id=1310417. Maximum sequence length: 2049, sample length: 4762 +[default0]:Skipping sample id=569035. Maximum sequence length: 2049, sample length: 3129 +[default0]:Skipping sample id=486529. Maximum sequence length: 2049, sample length: 2147 +[default0]:Skipping sample id=1136884. Maximum sequence length: 2049, sample length: 2349 +[default0]:Skipping sample id=432999. Maximum sequence length: 2049, sample length: 2768 +[default0]:Skipping sample id=1265094. Maximum sequence length: 2049, sample length: 2629 +[default0]:Skipping sample id=24158. Maximum sequence length: 2049, sample length: 4737 +[default0]:Skipping sample id=1507242. Maximum sequence length: 2049, sample length: 2190 +[default0]:Skipping sample id=41864. Maximum sequence length: 2049, sample length: 3737 +[default0]:Skipping sample id=681749. Maximum sequence length: 2049, sample length: 3007 +[default0]:Skipping sample id=832739. Maximum sequence length: 2049, sample length: 3797 +[default0]:Skipping sample id=690379. Maximum sequence length: 2049, sample length: 2301 +[default0]:Skipping sample id=283636. Maximum sequence length: 2049, sample length: 2272 +[default0]:Skipping sample id=89121. Maximum sequence length: 2049, sample length: 2570 +[default0]:Skipping sample id=1012132. Maximum sequence length: 2049, sample length: 3001 +[default0]:Skipping sample id=784054. Maximum sequence length: 2049, sample length: 4595 +[default0]:Skipping sample id=1191804. Maximum sequence length: 2049, sample length: 2533 +[default0]:Skipping sample id=1002040. Maximum sequence length: 2049, sample length: 2195 +[default0]:Skipping sample id=500487. Maximum sequence length: 2049, sample length: 3891 +[default0]:Skipping sample id=858772. Maximum sequence length: 2049, sample length: 2650 +[default0]:Skipping sample id=1470473. Maximum sequence length: 2049, sample length: 3530 +[default0]:Skipping sample id=245742. Maximum sequence length: 2049, sample length: 3108 +[default0]:Skipping sample id=975493. Maximum sequence length: 2049, sample length: 2840 +[default0]:Skipping sample id=563475. Maximum sequence length: 2049, sample length: 3733 +[default0]:Skipping sample id=655980. Maximum sequence length: 2049, sample length: 4111 +[default0]:Skipping sample id=422474. Maximum sequence length: 2049, sample length: 3045 +[default0]:Skipping sample id=661151. Maximum sequence length: 2049, sample length: 5079 +[default0]:Skipping sample id=667134. Maximum sequence length: 2049, sample length: 3926 +[default0]:Skipping sample id=1460910. Maximum sequence length: 2049, sample length: 3082 +[default0]:Skipping sample id=473039. Maximum sequence length: 2049, sample length: 2106 +[default0]:Skipping sample id=1530344. Maximum sequence length: 2049, sample length: 3223 +[default0]:Skipping sample id=41471. Maximum sequence length: 2049, sample length: 2566 +[default0]:Skipping sample id=356781. Maximum sequence length: 2049, sample length: 3262 +[default0]:Skipping sample id=521090. Maximum sequence length: 2049, sample length: 2289 +[default0]:Skipping sample id=1465204. Maximum sequence length: 2049, sample length: 2840 +[default0]:Skipping sample id=1451383. Maximum sequence length: 2049, sample length: 2377 +[default0]:Skipping sample id=1137077. Maximum sequence length: 2049, sample length: 2808 +[default0]:Skipping sample id=744778. Maximum sequence length: 2049, sample length: 2242 +[default0]:Skipping sample id=732066. Maximum sequence length: 2049, sample length: 3100 +[default0]:Skipping sample id=1108394. Maximum sequence length: 2049, sample length: 2486 +[default0]:Skipping sample id=164932. Maximum sequence length: 2049, sample length: 3080 +[default0]:Skipping sample id=764667. Maximum sequence length: 2049, sample length: 2864 +[default0]:Skipping sample id=1096532. Maximum sequence length: 2049, sample length: 2717 +[default0]:Skipping sample id=944962. Maximum sequence length: 2049, sample length: 3909 +[default0]:Skipping sample id=1436553. Maximum sequence length: 2049, sample length: 2337 +[default0]:Skipping sample id=282505. Maximum sequence length: 2049, sample length: 2171 +[default0]:Skipping sample id=736305. Maximum sequence length: 2049, sample length: 2181 +[default0]:Skipping sample id=1533760. Maximum sequence length: 2049, sample length: 4861 +[default0]:Skipping sample id=1481704. Maximum sequence length: 2049, sample length: 2809 +[default0]:Skipping sample id=867601. Maximum sequence length: 2049, sample length: 2069 +[default0]:Skipping sample id=1131032. Maximum sequence length: 2049, sample length: 2126 +[default0]:Skipping sample id=518701. Maximum sequence length: 2049, sample length: 2653 +[default0]:Skipping sample id=1295888. Maximum sequence length: 2049, sample length: 2632 +[default0]:Skipping sample id=114002. Maximum sequence length: 2049, sample length: 3008 +[default0]:Skipping sample id=723062. Maximum sequence length: 2049, sample length: 3918 +[default0]:Skipping sample id=1481712. Maximum sequence length: 2049, sample length: 3102 +[default0]:Skipping sample id=779542. Maximum sequence length: 2049, sample length: 3569 +[default0]:Skipping sample id=365121. Maximum sequence length: 2049, sample length: 2818 +[default0]:Skipping sample id=685299. Maximum sequence length: 2049, sample length: 2393 +[default0]:Skipping sample id=1529101. Maximum sequence length: 2049, sample length: 3743 +[default0]:Skipping sample id=298505. Maximum sequence length: 2049, sample length: 3214 +[default0]:Skipping sample id=1517742. Maximum sequence length: 2049, sample length: 2170 +[default0]:Skipping sample id=1048698. Maximum sequence length: 2049, sample length: 2141 +[default0]:Skipping sample id=840081. Maximum sequence length: 2049, sample length: 2361 +[default0]:Skipping sample id=1233558. Maximum sequence length: 2049, sample length: 2296 +[default0]:Skipping sample id=1445146. Maximum sequence length: 2049, sample length: 2675 +[default0]:Skipping sample id=626419. Maximum sequence length: 2049, sample length: 3722 +[default0]:Skipping sample id=1030597. Maximum sequence length: 2049, sample length: 3389 +[default0]:Skipping sample id=1102132. Maximum sequence length: 2049, sample length: 2570 +[default0]:Skipping sample id=1002489. Maximum sequence length: 2049, sample length: 2276 +[default0]:Skipping sample id=461369. Maximum sequence length: 2049, sample length: 3479 +[default0]:Skipping sample id=1216483. Maximum sequence length: 2049, sample length: 2353 +[default0]:Skipping sample id=1129105. Maximum sequence length: 2049, sample length: 2570 +[default0]:Skipping sample id=1172227. Maximum sequence length: 2049, sample length: 2180 +[default0]:Skipping sample id=25121. Maximum sequence length: 2049, sample length: 3407 +[default0]:Skipping sample id=1489285. Maximum sequence length: 2049, sample length: 3497 +[default0]:Skipping sample id=48697. Maximum sequence length: 2049, sample length: 5143 +[default0]:Skipping sample id=684179. Maximum sequence length: 2049, sample length: 3000 +[default0]:Skipping sample id=1037500. Maximum sequence length: 2049, sample length: 2576 +[default0]:Skipping sample id=1319809. Maximum sequence length: 2049, sample length: 3193 +[default0]:Skipping sample id=527322. Maximum sequence length: 2049, sample length: 2499 +[default0]:Skipping sample id=63417. Maximum sequence length: 2049, sample length: 2709 +[default0]:Skipping sample id=1169757. Maximum sequence length: 2049, sample length: 2109 +[default0]:Skipping sample id=253881. Maximum sequence length: 2049, sample length: 3465 +[default0]:Skipping sample id=1436109. Maximum sequence length: 2049, sample length: 5644 +[default0]:Skipping sample id=111325. Maximum sequence length: 2049, sample length: 2887 +[default0]:Skipping sample id=1180771. Maximum sequence length: 2049, sample length: 3619 +[default0]:Skipping sample id=1219392. Maximum sequence length: 2049, sample length: 4548 +[default0]:Skipping sample id=376943. Maximum sequence length: 2049, sample length: 2756 +[default0]:Skipping sample id=228321. Maximum sequence length: 2049, sample length: 3436 +[default0]:Skipping sample id=838520. Maximum sequence length: 2049, sample length: 2106 +[default0]:Skipping sample id=827411. Maximum sequence length: 2049, sample length: 2687 +[default0]:Skipping sample id=553219. Maximum sequence length: 2049, sample length: 7003 +[default0]:Skipping sample id=1001824. Maximum sequence length: 2049, sample length: 2535 +[default0]:Skipping sample id=598908. Maximum sequence length: 2049, sample length: 2768 +[default0]:Skipping sample id=944012. Maximum sequence length: 2049, sample length: 2841 +[default0]:Skipping sample id=436607. Maximum sequence length: 2049, sample length: 2256 +[default0]:Skipping sample id=1506171. Maximum sequence length: 2049, sample length: 3828 +[default0]:Skipping sample id=696207. Maximum sequence length: 2049, sample length: 3257 +[default0]:Skipping sample id=849581. Maximum sequence length: 2049, sample length: 2854 +[default0]:Skipping sample id=937396. Maximum sequence length: 2049, sample length: 6236 +[default0]:Skipping sample id=468969. Maximum sequence length: 2049, sample length: 3374 +[default0]:Skipping sample id=1225210. Maximum sequence length: 2049, sample length: 2950 +[default0]:Skipping sample id=354693. Maximum sequence length: 2049, sample length: 2575 +[default0]:Skipping sample id=189131. Maximum sequence length: 2049, sample length: 4666 +[default0]:Skipping sample id=738182. Maximum sequence length: 2049, sample length: 2471 +[default0]:Skipping sample id=13278. Maximum sequence length: 2049, sample length: 2070 +[default0]:Skipping sample id=154860. Maximum sequence length: 2049, sample length: 4193 +[default0]:Skipping sample id=307831. Maximum sequence length: 2049, sample length: 2724 +[default0]:Skipping sample id=1282646. Maximum sequence length: 2049, sample length: 2606 +[default0]:Skipping sample id=1496369. Maximum sequence length: 2049, sample length: 2630 +[default0]:Skipping sample id=1357695. Maximum sequence length: 2049, sample length: 3786 +[default0]:Skipping sample id=1064200. Maximum sequence length: 2049, sample length: 3014 +[default0]:Skipping sample id=292469. Maximum sequence length: 2049, sample length: 2598 +[default0]:Skipping sample id=459098. Maximum sequence length: 2049, sample length: 2694 +[default0]:Skipping sample id=1166053. Maximum sequence length: 2049, sample length: 5868 +[default0]:Skipping sample id=1530848. Maximum sequence length: 2049, sample length: 2523 +[default0]:Skipping sample id=509594. Maximum sequence length: 2049, sample length: 3145 +[default0]:Skipping sample id=685492. Maximum sequence length: 2049, sample length: 2581 +[default0]:Skipping sample id=1299588. Maximum sequence length: 2049, sample length: 2635 +[default0]:Skipping sample id=1532417. Maximum sequence length: 2049, sample length: 3601 +[default0]:Skipping sample id=930687. Maximum sequence length: 2049, sample length: 3245 +[default0]:Skipping sample id=489601. Maximum sequence length: 2049, sample length: 3381 +[default0]:Skipping sample id=1029019. Maximum sequence length: 2049, sample length: 2508 +[default0]:Skipping sample id=1550451. Maximum sequence length: 2049, sample length: 2416 +[default0]:Skipping sample id=537787. Maximum sequence length: 2049, sample length: 4224 +[default0]:Skipping sample id=286567. Maximum sequence length: 2049, sample length: 2123 +[default0]:Skipping sample id=274108. Maximum sequence length: 2049, sample length: 2184 +[default0]:Skipping sample id=1501972. Maximum sequence length: 2049, sample length: 2306 +[default0]:Skipping sample id=503679. Maximum sequence length: 2049, sample length: 3862 +[default0]:Skipping sample id=963199. Maximum sequence length: 2049, sample length: 2518 +[default0]:Skipping sample id=553777. Maximum sequence length: 2049, sample length: 3616 +[default0]:Skipping sample id=907873. Maximum sequence length: 2049, sample length: 2445 +[default0]:Skipping sample id=1208836. Maximum sequence length: 2049, sample length: 2431 +[default0]:Skipping sample id=132577. Maximum sequence length: 2049, sample length: 2457 +[default0]:Skipping sample id=1303121. Maximum sequence length: 2049, sample length: 2262 +[default0]:Skipping sample id=992951. Maximum sequence length: 2049, sample length: 3154 +[default0]:Skipping sample id=812018. Maximum sequence length: 2049, sample length: 2931 +[default0]:Skipping sample id=1569998. Maximum sequence length: 2049, sample length: 2949 +[default0]:Skipping sample id=222475. Maximum sequence length: 2049, sample length: 2420 +[default0]:Skipping sample id=662486. Maximum sequence length: 2049, sample length: 2078 +[default0]:Skipping sample id=587413. Maximum sequence length: 2049, sample length: 2922 +[default0]:Skipping sample id=763755. Maximum sequence length: 2049, sample length: 3128 +[default0]:Skipping sample id=432469. Maximum sequence length: 2049, sample length: 3159 +[default0]:Skipping sample id=1293680. Maximum sequence length: 2049, sample length: 2755 +[default0]:Skipping sample id=815314. Maximum sequence length: 2049, sample length: 3036 +[default0]:Skipping sample id=1550044. Maximum sequence length: 2049, sample length: 2445 +[default0]:Skipping sample id=223337. Maximum sequence length: 2049, sample length: 2349 +[default0]:Skipping sample id=811347. Maximum sequence length: 2049, sample length: 3613 +[default0]:Skipping sample id=726832. Maximum sequence length: 2049, sample length: 2969 +[default0]:Skipping sample id=1248201. Maximum sequence length: 2049, sample length: 3020 +[default0]:Skipping sample id=1148358. Maximum sequence length: 2049, sample length: 2394 +[default0]:Skipping sample id=1288618. Maximum sequence length: 2049, sample length: 4479 +[default0]:Skipping sample id=672826. Maximum sequence length: 2049, sample length: 2985 +[default0]:Skipping sample id=1150004. Maximum sequence length: 2049, sample length: 3685 +[default0]:Skipping sample id=446657. Maximum sequence length: 2049, sample length: 4337 +[default0]:Skipping sample id=1559282. Maximum sequence length: 2049, sample length: 2653 +[default0]:Skipping sample id=1254645. Maximum sequence length: 2049, sample length: 2512 +[default0]:Skipping sample id=444289. Maximum sequence length: 2049, sample length: 4146 +[default0]:Skipping sample id=1228699. Maximum sequence length: 2049, sample length: 3495 +[default0]:Skipping sample id=599268. Maximum sequence length: 2049, sample length: 2118 +[default0]:Skipping sample id=789824. Maximum sequence length: 2049, sample length: 2667 +[default0]:Skipping sample id=1224580. Maximum sequence length: 2049, sample length: 4041 +[default0]:Skipping sample id=207864. Maximum sequence length: 2049, sample length: 2710 +[default0]:Skipping sample id=1203595. Maximum sequence length: 2049, sample length: 2425 +[default0]:Skipping sample id=1203573. Maximum sequence length: 2049, sample length: 2076 +[default0]:Skipping sample id=987649. Maximum sequence length: 2049, sample length: 2562 +[default0]:Skipping sample id=486218. Maximum sequence length: 2049, sample length: 2163 +[default0]:Skipping sample id=742049. Maximum sequence length: 2049, sample length: 3280 +[default0]:Skipping sample id=350893. Maximum sequence length: 2049, sample length: 2693 +[default0]:Skipping sample id=571176. Maximum sequence length: 2049, sample length: 5210 +[default0]:Skipping sample id=359702. Maximum sequence length: 2049, sample length: 2248 +[default0]:Skipping sample id=1044843. Maximum sequence length: 2049, sample length: 5857 +[default0]:Skipping sample id=1419624. Maximum sequence length: 2049, sample length: 2590 +[default0]:Skipping sample id=935232. Maximum sequence length: 2049, sample length: 2164 +[default0]:Skipping sample id=557828. Maximum sequence length: 2049, sample length: 3263 +[default0]:Skipping sample id=1193679. Maximum sequence length: 2049, sample length: 3205 +[default0]:Skipping sample id=853370. Maximum sequence length: 2049, sample length: 2106 +[default0]:Skipping sample id=444768. Maximum sequence length: 2049, sample length: 3809 +[default0]:Skipping sample id=438681. Maximum sequence length: 2049, sample length: 3321 +[default0]:Skipping sample id=1521154. Maximum sequence length: 2049, sample length: 2775 +[default0]:Skipping sample id=979906. Maximum sequence length: 2049, sample length: 2585 +[default0]:Skipping sample id=1405185. Maximum sequence length: 2049, sample length: 2924 +[default0]:Skipping sample id=205291. Maximum sequence length: 2049, sample length: 3695 +[default0]:Skipping sample id=790785. Maximum sequence length: 2049, sample length: 2283 +[default0]:Skipping sample id=185176. Maximum sequence length: 2049, sample length: 5466 +[default0]:Skipping sample id=1564395. Maximum sequence length: 2049, sample length: 2362 +[default0]:Skipping sample id=51722. Maximum sequence length: 2049, sample length: 2449 +[default0]:Skipping sample id=1053899. Maximum sequence length: 2049, sample length: 2656 +[default0]:Skipping sample id=74161. Maximum sequence length: 2049, sample length: 2306 +[default0]:Skipping sample id=1151888. Maximum sequence length: 2049, sample length: 2190 +[default0]:Skipping sample id=482892. Maximum sequence length: 2049, sample length: 2735 +[default0]:Skipping sample id=691913. Maximum sequence length: 2049, sample length: 2414 +[default0]:Skipping sample id=504461. Maximum sequence length: 2049, sample length: 2486 +[default0]:Skipping sample id=1521888. Maximum sequence length: 2049, sample length: 3473 +[default0]:Skipping sample id=986777. Maximum sequence length: 2049, sample length: 2061 +[default0]:Skipping sample id=215931. Maximum sequence length: 2049, sample length: 2199 +[default0]:Skipping sample id=1018223. Maximum sequence length: 2049, sample length: 2238 +[default0]:Skipping sample id=988165. Maximum sequence length: 2049, sample length: 2150 +[default0]:Skipping sample id=1039100. Maximum sequence length: 2049, sample length: 2840 +[default0]:Skipping sample id=1016410. Maximum sequence length: 2049, sample length: 3316 +[default0]:Skipping sample id=1037130. Maximum sequence length: 2049, sample length: 3837 +[default0]:Skipping sample id=914406. Maximum sequence length: 2049, sample length: 3456 +[default0]:Skipping sample id=247037. Maximum sequence length: 2049, sample length: 3173 +[default0]:Skipping sample id=1316919. Maximum sequence length: 2049, sample length: 2704 +[default0]:Skipping sample id=831841. Maximum sequence length: 2049, sample length: 2108 +[default0]:Skipping sample id=1253561. Maximum sequence length: 2049, sample length: 3505 +[default0]:Skipping sample id=1214207. Maximum sequence length: 2049, sample length: 3441 +[default0]:Skipping sample id=255909. Maximum sequence length: 2049, sample length: 2263 +[default0]:Skipping sample id=619644. Maximum sequence length: 2049, sample length: 3317 +[default0]:Skipping sample id=1045678. Maximum sequence length: 2049, sample length: 2221 +[default0]:Skipping sample id=20580. Maximum sequence length: 2049, sample length: 3209 +[default0]:Skipping sample id=871146. Maximum sequence length: 2049, sample length: 2466 +[default0]:Skipping sample id=798627. Maximum sequence length: 2049, sample length: 2734 +[default0]:Skipping sample id=1050890. Maximum sequence length: 2049, sample length: 3478 +[default0]:Skipping sample id=83981. Maximum sequence length: 2049, sample length: 2075 +[default0]:Skipping sample id=913705. Maximum sequence length: 2049, sample length: 2518 +[default0]:Skipping sample id=128307. Maximum sequence length: 2049, sample length: 5045 +[default0]:Skipping sample id=476603. Maximum sequence length: 2049, sample length: 2179 +[default0]:Skipping sample id=348549. Maximum sequence length: 2049, sample length: 5150 +[default0]:Skipping sample id=815430. Maximum sequence length: 2049, sample length: 2714 +[default0]:Skipping sample id=714445. Maximum sequence length: 2049, sample length: 2263 +[default0]:Skipping sample id=1554481. Maximum sequence length: 2049, sample length: 4693 +[default0]:Skipping sample id=1258082. Maximum sequence length: 2049, sample length: 2647 +[default0]:Skipping sample id=460454. Maximum sequence length: 2049, sample length: 2555 +[default0]:Skipping sample id=662405. Maximum sequence length: 2049, sample length: 4142 +[default0]:Skipping sample id=758829. Maximum sequence length: 2049, sample length: 3071 +[default0]:Skipping sample id=67739. Maximum sequence length: 2049, sample length: 2262 +[default0]:Skipping sample id=99855. Maximum sequence length: 2049, sample length: 2832 +[default0]:Skipping sample id=1311470. Maximum sequence length: 2049, sample length: 2226 +[default0]:Skipping sample id=1369788. Maximum sequence length: 2049, sample length: 2074 +[default0]:Skipping sample id=514858. Maximum sequence length: 2049, sample length: 3127 +[default0]:Skipping sample id=1160356. Maximum sequence length: 2049, sample length: 3480 +[default0]:Skipping sample id=1532158. Maximum sequence length: 2049, sample length: 2653 +[default0]:Skipping sample id=833744. Maximum sequence length: 2049, sample length: 3766 +[default0]:Skipping sample id=1256366. Maximum sequence length: 2049, sample length: 3066 +[default0]:Skipping sample id=1503129. Maximum sequence length: 2049, sample length: 2591 +[default0]:Skipping sample id=787636. Maximum sequence length: 2049, sample length: 5907 +[default0]:Skipping sample id=173702. Maximum sequence length: 2049, sample length: 3461 +[default0]:Skipping sample id=1133708. Maximum sequence length: 2049, sample length: 2855 +[default0]:Skipping sample id=995187. Maximum sequence length: 2049, sample length: 3426 +[default0]:Skipping sample id=1288868. Maximum sequence length: 2049, sample length: 3352 +[default0]:Skipping sample id=1537856. Maximum sequence length: 2049, sample length: 2275 +[default0]:Skipping sample id=1448018. Maximum sequence length: 2049, sample length: 3451 +[default0]:Skipping sample id=1381181. Maximum sequence length: 2049, sample length: 4344 +[default0]:Skipping sample id=1382932. Maximum sequence length: 2049, sample length: 2666 +[default0]:Skipping sample id=150941. Maximum sequence length: 2049, sample length: 2185 +[default0]:Skipping sample id=686225. Maximum sequence length: 2049, sample length: 4630 +[default0]:Skipping sample id=1308361. Maximum sequence length: 2049, sample length: 3336 +[default0]:Skipping sample id=701999. Maximum sequence length: 2049, sample length: 2402 +[default0]:Skipping sample id=981285. Maximum sequence length: 2049, sample length: 3880 +[default0]:Skipping sample id=1183185. Maximum sequence length: 2049, sample length: 2542 +[default0]:Skipping sample id=996752. Maximum sequence length: 2049, sample length: 3389 +[default0]:Skipping sample id=1057964. Maximum sequence length: 2049, sample length: 2666 +[default0]:Skipping sample id=871472. Maximum sequence length: 2049, sample length: 3579 +[default0]:Skipping sample id=172813. Maximum sequence length: 2049, sample length: 2407 +[default0]:Skipping sample id=352428. Maximum sequence length: 2049, sample length: 2139 +[default0]:Skipping sample id=244141. Maximum sequence length: 2049, sample length: 3322 +[default0]:Skipping sample id=862576. Maximum sequence length: 2049, sample length: 2502 +[default0]:Skipping sample id=1056178. Maximum sequence length: 2049, sample length: 3748 +[default0]:Skipping sample id=631546. Maximum sequence length: 2049, sample length: 2722 +[default0]:Skipping sample id=827955. Maximum sequence length: 2049, sample length: 2314 +[default0]:Skipping sample id=1142831. Maximum sequence length: 2049, sample length: 4369 +[default0]:Skipping sample id=951901. Maximum sequence length: 2049, sample length: 4654 +[default0]:Skipping sample id=489567. Maximum sequence length: 2049, sample length: 2221 +[default0]:Skipping sample id=638926. Maximum sequence length: 2049, sample length: 3559 +[default0]:Skipping sample id=338115. Maximum sequence length: 2049, sample length: 2208 +[default0]:Skipping sample id=506976. Maximum sequence length: 2049, sample length: 2096 +[default0]:Skipping sample id=800826. Maximum sequence length: 2049, sample length: 2222 +[default0]:Skipping sample id=469436. Maximum sequence length: 2049, sample length: 2272 +[default0]:Skipping sample id=737063. Maximum sequence length: 2049, sample length: 2324 +[default0]:Skipping sample id=668108. Maximum sequence length: 2049, sample length: 2154 +[default0]:Skipping sample id=69907. Maximum sequence length: 2049, sample length: 2608 +[default0]:Skipping sample id=1115335. Maximum sequence length: 2049, sample length: 3401 +[default0]:Skipping sample id=842636. Maximum sequence length: 2049, sample length: 3732 +[default0]:Skipping sample id=836063. Maximum sequence length: 2049, sample length: 2298 +[default0]:Skipping sample id=290698. Maximum sequence length: 2049, sample length: 2386 +[default0]:Skipping sample id=410610. Maximum sequence length: 2049, sample length: 2231 +[default0]:Skipping sample id=270378. Maximum sequence length: 2049, sample length: 2503 +[default0]:Skipping sample id=400550. Maximum sequence length: 2049, sample length: 2458 +[default0]:Skipping sample id=1198164. Maximum sequence length: 2049, sample length: 2064 +[default0]:Skipping sample id=721259. Maximum sequence length: 2049, sample length: 2967 +[default0]:Skipping sample id=38593. Maximum sequence length: 2049, sample length: 3883 +[default0]:Skipping sample id=644829. Maximum sequence length: 2049, sample length: 3125 +[default0]:Skipping sample id=478596. Maximum sequence length: 2049, sample length: 3180 +[default0]:Skipping sample id=854461. Maximum sequence length: 2049, sample length: 3157 +[default0]:Skipping sample id=1261607. Maximum sequence length: 2049, sample length: 2926 +[default0]:Skipping sample id=669362. Maximum sequence length: 2049, sample length: 4355 +[default0]:Skipping sample id=805450. Maximum sequence length: 2049, sample length: 2756 +[default0]:Skipping sample id=1363094. Maximum sequence length: 2049, sample length: 3345 +[default0]:Skipping sample id=1088244. Maximum sequence length: 2049, sample length: 2513 +[default0]:Skipping sample id=1493318. Maximum sequence length: 2049, sample length: 3135 +[default0]:Skipping sample id=323216. Maximum sequence length: 2049, sample length: 4291 +[default0]:Skipping sample id=687501. Maximum sequence length: 2049, sample length: 4455 +[default0]:Skipping sample id=224068. Maximum sequence length: 2049, sample length: 3598 +[default0]:Skipping sample id=819996. Maximum sequence length: 2049, sample length: 2882 +[default0]:Skipping sample id=634170. Maximum sequence length: 2049, sample length: 2161 +[default0]:Skipping sample id=469711. Maximum sequence length: 2049, sample length: 2108 +[default0]:Skipping sample id=745659. Maximum sequence length: 2049, sample length: 2194 +[default0]:Skipping sample id=651572. Maximum sequence length: 2049, sample length: 2870 +[default0]:Skipping sample id=704266. Maximum sequence length: 2049, sample length: 2064 +[default0]:Skipping sample id=400978. Maximum sequence length: 2049, sample length: 2173 +[default0]:Skipping sample id=1400774. Maximum sequence length: 2049, sample length: 2742 +[default0]:Skipping sample id=1511280. Maximum sequence length: 2049, sample length: 2067 +[default0]:Skipping sample id=972925. Maximum sequence length: 2049, sample length: 2445 +[default0]:Skipping sample id=244767. Maximum sequence length: 2049, sample length: 2239 +[default0]:Skipping sample id=41249. Maximum sequence length: 2049, sample length: 2396 +[default0]:Skipping sample id=1233249. Maximum sequence length: 2049, sample length: 2902 +[default0]:Skipping sample id=1030833. Maximum sequence length: 2049, sample length: 3127 +[default0]:Skipping sample id=1509062. Maximum sequence length: 2049, sample length: 3433 +[default0]:Skipping sample id=244156. Maximum sequence length: 2049, sample length: 2825 +[default0]:Skipping sample id=831602. Maximum sequence length: 2049, sample length: 2920 +[default0]:Skipping sample id=1224164. Maximum sequence length: 2049, sample length: 2417 +[default0]:Skipping sample id=1392578. Maximum sequence length: 2049, sample length: 2660 +[default0]:Skipping sample id=1455100. Maximum sequence length: 2049, sample length: 2364 +[default0]:Skipping sample id=668068. Maximum sequence length: 2049, sample length: 2136 +[default0]:Skipping sample id=331977. Maximum sequence length: 2049, sample length: 5474 +[default0]:Skipping sample id=1524800. Maximum sequence length: 2049, sample length: 2529 +[default0]:Skipping sample id=829271. Maximum sequence length: 2049, sample length: 2877 +[default0]:Skipping sample id=1470246. Maximum sequence length: 2049, sample length: 3379 +[default0]:Skipping sample id=1177982. Maximum sequence length: 2049, sample length: 2108 +[default0]:Skipping sample id=96044. Maximum sequence length: 2049, sample length: 2129 +[default0]:Skipping sample id=659029. Maximum sequence length: 2049, sample length: 3286 +[default0]:Skipping sample id=518980. Maximum sequence length: 2049, sample length: 3275 +[default0]:Skipping sample id=1378732. Maximum sequence length: 2049, sample length: 3368 +[default0]:Skipping sample id=1136743. Maximum sequence length: 2049, sample length: 3166 +[default0]:Skipping sample id=1178718. Maximum sequence length: 2049, sample length: 2228 +[default0]:Skipping sample id=768546. Maximum sequence length: 2049, sample length: 2371 +[default0]:Skipping sample id=1249491. Maximum sequence length: 2049, sample length: 3128 +[default0]:Skipping sample id=869694. Maximum sequence length: 2049, sample length: 2169 +[default0]:Skipping sample id=1544847. Maximum sequence length: 2049, sample length: 4089 +[default0]:Skipping sample id=690896. Maximum sequence length: 2049, sample length: 4217 +[default0]:Skipping sample id=1100267. Maximum sequence length: 2049, sample length: 2550 +[default0]:Skipping sample id=884942. Maximum sequence length: 2049, sample length: 2708 +[default0]:Skipping sample id=654366. Maximum sequence length: 2049, sample length: 2600 +[default0]:Skipping sample id=1294867. Maximum sequence length: 2049, sample length: 2926 +[default0]:Skipping sample id=781818. Maximum sequence length: 2049, sample length: 3441 +[default0]:Skipping sample id=1325917. Maximum sequence length: 2049, sample length: 2904 +[default0]:Skipping sample id=364180. Maximum sequence length: 2049, sample length: 2314 +[default0]:Skipping sample id=637927. Maximum sequence length: 2049, sample length: 3018 +[default0]:Skipping sample id=826255. Maximum sequence length: 2049, sample length: 2352 +[default0]:Skipping sample id=453490. Maximum sequence length: 2049, sample length: 2647 +[default0]:Skipping sample id=220704. Maximum sequence length: 2049, sample length: 2305 +[default0]:Skipping sample id=3768. Maximum sequence length: 2049, sample length: 4134 +[default0]:Skipping sample id=383455. Maximum sequence length: 2049, sample length: 2344 +[default0]:Skipping sample id=751435. Maximum sequence length: 2049, sample length: 2907 +[default0]:Skipping sample id=1552744. Maximum sequence length: 2049, sample length: 2946 +[default0]:Skipping sample id=605449. Maximum sequence length: 2049, sample length: 2855 +[default0]:Skipping sample id=512518. Maximum sequence length: 2049, sample length: 2106 +[default0]:Skipping sample id=1558402. Maximum sequence length: 2049, sample length: 3850 +[default0]:Skipping sample id=1497624. Maximum sequence length: 2049, sample length: 2703 +[default0]:Skipping sample id=1306308. Maximum sequence length: 2049, sample length: 2736 +[default0]:Skipping sample id=1070556. Maximum sequence length: 2049, sample length: 3592 +[default0]:Skipping sample id=1161812. Maximum sequence length: 2049, sample length: 2282 +[default0]:Skipping sample id=398316. Maximum sequence length: 2049, sample length: 2223 +[default0]:Skipping sample id=1189100. Maximum sequence length: 2049, sample length: 4478 +[default0]:Skipping sample id=1014537. Maximum sequence length: 2049, sample length: 3471 +[default0]:Skipping sample id=1488515. Maximum sequence length: 2049, sample length: 2279 +[default0]:Skipping sample id=792294. Maximum sequence length: 2049, sample length: 2447 +[default0]:Skipping sample id=1432881. Maximum sequence length: 2049, sample length: 3735 +[default0]:Skipping sample id=1037734. Maximum sequence length: 2049, sample length: 2204 +[default0]:Skipping sample id=377185. Maximum sequence length: 2049, sample length: 2052 +[default0]:Skipping sample id=727963. Maximum sequence length: 2049, sample length: 2282 +[default0]:Skipping sample id=184934. Maximum sequence length: 2049, sample length: 2711 +[default0]:Skipping sample id=307163. Maximum sequence length: 2049, sample length: 2174 +[default0]:Skipping sample id=574386. Maximum sequence length: 2049, sample length: 2173 +[default0]:Skipping sample id=831559. Maximum sequence length: 2049, sample length: 2108 +[default0]:Skipping sample id=1318386. Maximum sequence length: 2049, sample length: 5885 +[default0]:Skipping sample id=154268. Maximum sequence length: 2049, sample length: 2811 +[default0]:Skipping sample id=260976. Maximum sequence length: 2049, sample length: 3699 +[default0]:Skipping sample id=97048. Maximum sequence length: 2049, sample length: 2112 +[default0]:Skipping sample id=1254133. Maximum sequence length: 2049, sample length: 2929 +[default0]:Skipping sample id=1287960. Maximum sequence length: 2049, sample length: 2059 +[default0]:Skipping sample id=921828. Maximum sequence length: 2049, sample length: 3372 +[default0]:Skipping sample id=406299. Maximum sequence length: 2049, sample length: 2071 +[default0]:Skipping sample id=569542. Maximum sequence length: 2049, sample length: 3136 +[default0]:Skipping sample id=1548189. Maximum sequence length: 2049, sample length: 2411 +[default0]:Skipping sample id=1509278. Maximum sequence length: 2049, sample length: 2755 +[default0]:Skipping sample id=1063945. Maximum sequence length: 2049, sample length: 2612 +[default0]:Skipping sample id=1459227. Maximum sequence length: 2049, sample length: 5896 +[default0]:Skipping sample id=1301299. Maximum sequence length: 2049, sample length: 2500 +[default0]:Skipping sample id=1235938. Maximum sequence length: 2049, sample length: 2427 +[default0]:Skipping sample id=293047. Maximum sequence length: 2049, sample length: 2113 +[default0]:Skipping sample id=1451760. Maximum sequence length: 2049, sample length: 2850 +[default0]:Skipping sample id=1420887. Maximum sequence length: 2049, sample length: 2179 +[default0]:Skipping sample id=831899. Maximum sequence length: 2049, sample length: 2653 +[default0]:Skipping sample id=1434623. Maximum sequence length: 2049, sample length: 3601 +[default0]:Skipping sample id=34020. Maximum sequence length: 2049, sample length: 2996 +[default0]:Skipping sample id=753437. Maximum sequence length: 2049, sample length: 3874 +[default0]:Skipping sample id=788582. Maximum sequence length: 2049, sample length: 3076 +[default0]:Skipping sample id=551044. Maximum sequence length: 2049, sample length: 2358 +[default0]:Skipping sample id=711974. Maximum sequence length: 2049, sample length: 2387 +[default0]:Skipping sample id=416332. Maximum sequence length: 2049, sample length: 2855 +[default0]:Skipping sample id=874036. Maximum sequence length: 2049, sample length: 2594 +[default0]:Skipping sample id=303948. Maximum sequence length: 2049, sample length: 5598 +[default0]:Skipping sample id=1185756. Maximum sequence length: 2049, sample length: 3278 +[default0]:Skipping sample id=91306. Maximum sequence length: 2049, sample length: 2071 +[default0]:Skipping sample id=1423586. Maximum sequence length: 2049, sample length: 4264 +[default0]:Skipping sample id=1502872. Maximum sequence length: 2049, sample length: 2424 +[default0]:Skipping sample id=430392. Maximum sequence length: 2049, sample length: 3390 +[default0]:Skipping sample id=199850. Maximum sequence length: 2049, sample length: 2364 +[default0]:Skipping sample id=1468476. Maximum sequence length: 2049, sample length: 2535 +[default0]:Skipping sample id=792907. Maximum sequence length: 2049, sample length: 3431 +[default0]:Skipping sample id=1014115. Maximum sequence length: 2049, sample length: 5127 +[default0]:Skipping sample id=90750. Maximum sequence length: 2049, sample length: 2952 +[default0]:Skipping sample id=811004. Maximum sequence length: 2049, sample length: 3190 +[default0]:Skipping sample id=193039. Maximum sequence length: 2049, sample length: 2108 +[default0]:Skipping sample id=288712. Maximum sequence length: 2049, sample length: 2748 +[default0]:Skipping sample id=1155683. Maximum sequence length: 2049, sample length: 2080 +[default0]:Skipping sample id=890800. Maximum sequence length: 2049, sample length: 2456 +[default0]:Skipping sample id=664825. Maximum sequence length: 2049, sample length: 2667 +[default0]:Skipping sample id=599765. Maximum sequence length: 2049, sample length: 4977 +[default0]:Skipping sample id=26946. Maximum sequence length: 2049, sample length: 3945 +[default0]:Skipping sample id=435702. Maximum sequence length: 2049, sample length: 6653 +[default0]:Skipping sample id=616980. Maximum sequence length: 2049, sample length: 2910 +[default0]:Skipping sample id=1435350. Maximum sequence length: 2049, sample length: 2495 +[default0]:Skipping sample id=41203. Maximum sequence length: 2049, sample length: 4088 +[default0]:Skipping sample id=1018355. Maximum sequence length: 2049, sample length: 2077 +[default0]:Skipping sample id=1132198. Maximum sequence length: 2049, sample length: 2475 +[default0]:Skipping sample id=1058892. Maximum sequence length: 2049, sample length: 3116 +[default0]:Skipping sample id=1199706. Maximum sequence length: 2049, sample length: 2549 +[default0]:Skipping sample id=216337. Maximum sequence length: 2049, sample length: 2303 +[default0]:Skipping sample id=386555. Maximum sequence length: 2049, sample length: 4546 +[default0]:Skipping sample id=897492. Maximum sequence length: 2049, sample length: 2156 +[default0]:Skipping sample id=480161. Maximum sequence length: 2049, sample length: 2770 +[default0]:Skipping sample id=207375. Maximum sequence length: 2049, sample length: 3301 +[default0]:Skipping sample id=1508987. Maximum sequence length: 2049, sample length: 2962 +[default0]:Skipping sample id=848379. Maximum sequence length: 2049, sample length: 2769 +[default0]:Skipping sample id=1074633. Maximum sequence length: 2049, sample length: 2766 +[default0]:Skipping sample id=1015780. Maximum sequence length: 2049, sample length: 3296 +[default0]:Skipping sample id=721691. Maximum sequence length: 2049, sample length: 3383 +[default0]:Skipping sample id=204757. Maximum sequence length: 2049, sample length: 2407 +[default0]:Skipping sample id=703434. Maximum sequence length: 2049, sample length: 2941 +[default0]:Skipping sample id=1208525. Maximum sequence length: 2049, sample length: 2586 +[default0]:Skipping sample id=139754. Maximum sequence length: 2049, sample length: 2791 +[default0]:Skipping sample id=802954. Maximum sequence length: 2049, sample length: 4606 +[default0]:Skipping sample id=793013. Maximum sequence length: 2049, sample length: 4415 +[default0]:Skipping sample id=1338355. Maximum sequence length: 2049, sample length: 2194 +[default0]:Skipping sample id=1196956. Maximum sequence length: 2049, sample length: 2612 +[default0]:Skipping sample id=655444. Maximum sequence length: 2049, sample length: 2488 +[default0]:Skipping sample id=136904. Maximum sequence length: 2049, sample length: 2447 +[default0]:Skipping sample id=79743. Maximum sequence length: 2049, sample length: 2316 +[default0]:Skipping sample id=822405. Maximum sequence length: 2049, sample length: 2366 +[default0]:Skipping sample id=1145235. Maximum sequence length: 2049, sample length: 2850 +[default0]:Skipping sample id=1537724. Maximum sequence length: 2049, sample length: 2667 +[default0]:Skipping sample id=591587. Maximum sequence length: 2049, sample length: 2123 +[default0]:Skipping sample id=518234. Maximum sequence length: 2049, sample length: 2078 +[default0]:Skipping sample id=117649. Maximum sequence length: 2049, sample length: 5251 +[default0]:Skipping sample id=1024844. Maximum sequence length: 2049, sample length: 2743 +[default0]:Skipping sample id=816967. Maximum sequence length: 2049, sample length: 3916 +[default0]:Skipping sample id=1340465. Maximum sequence length: 2049, sample length: 2675 +[default0]:Skipping sample id=210090. Maximum sequence length: 2049, sample length: 2496 +[default0]:Skipping sample id=1192999. Maximum sequence length: 2049, sample length: 3321 +[default0]:Skipping sample id=1170239. Maximum sequence length: 2049, sample length: 2245 +[default0]:Skipping sample id=1330. Maximum sequence length: 2049, sample length: 2444 +[default0]:Skipping sample id=1481379. Maximum sequence length: 2049, sample length: 4019 +[default0]:Skipping sample id=1076128. Maximum sequence length: 2049, sample length: 2316 +[default0]:Skipping sample id=932425. Maximum sequence length: 2049, sample length: 4149 +[default0]:Skipping sample id=1412470. Maximum sequence length: 2049, sample length: 2436 +[default0]:Skipping sample id=1505525. Maximum sequence length: 2049, sample length: 3119 +[default0]:Skipping sample id=1166874. Maximum sequence length: 2049, sample length: 6775 +[default0]:Skipping sample id=1404682. Maximum sequence length: 2049, sample length: 3046 +[default0]:Skipping sample id=1208044. Maximum sequence length: 2049, sample length: 2985 +[default0]:Skipping sample id=959569. Maximum sequence length: 2049, sample length: 2595 +[default0]:Skipping sample id=1104665. Maximum sequence length: 2049, sample length: 3178 +[default0]:Skipping sample id=767379. Maximum sequence length: 2049, sample length: 2374 +[default0]:Skipping sample id=526883. Maximum sequence length: 2049, sample length: 2548 +[default0]:Skipping sample id=1093504. Maximum sequence length: 2049, sample length: 3260 +[default0]:Skipping sample id=292925. Maximum sequence length: 2049, sample length: 2505 +[default0]:Skipping sample id=1426285. Maximum sequence length: 2049, sample length: 3212 +[default0]:Skipping sample id=1289621. Maximum sequence length: 2049, sample length: 2779 +[default0]:Skipping sample id=514212. Maximum sequence length: 2049, sample length: 2574 +[default0]:Skipping sample id=1342806. Maximum sequence length: 2049, sample length: 5390 +[default0]:Skipping sample id=1459406. Maximum sequence length: 2049, sample length: 2826 +[default0]:Skipping sample id=643582. Maximum sequence length: 2049, sample length: 3819 +[default0]:Skipping sample id=173053. Maximum sequence length: 2049, sample length: 7654 +[default0]:Skipping sample id=814660. Maximum sequence length: 2049, sample length: 2723 +[default0]:Skipping sample id=961479. Maximum sequence length: 2049, sample length: 3192 +[default0]:Skipping sample id=1227257. Maximum sequence length: 2049, sample length: 5356 +[default0]:Skipping sample id=460252. Maximum sequence length: 2049, sample length: 4656 +[default0]:Skipping sample id=467788. Maximum sequence length: 2049, sample length: 2800 +[default0]:Skipping sample id=1320781. Maximum sequence length: 2049, sample length: 2640 +[default0]:Skipping sample id=319986. Maximum sequence length: 2049, sample length: 2082 +[default0]:Skipping sample id=10144. Maximum sequence length: 2049, sample length: 2082 +[default0]:Skipping sample id=760983. Maximum sequence length: 2049, sample length: 2415 +[default0]:Skipping sample id=1173048. Maximum sequence length: 2049, sample length: 2359 +[default0]:Skipping sample id=728127. Maximum sequence length: 2049, sample length: 2053 +[default0]:Skipping sample id=1421757. Maximum sequence length: 2049, sample length: 2956 +[default0]:Skipping sample id=670484. Maximum sequence length: 2049, sample length: 2441 +[default0]:Skipping sample id=686118. Maximum sequence length: 2049, sample length: 2656 +[default0]:Skipping sample id=1407459. Maximum sequence length: 2049, sample length: 3288 +[default0]:Skipping sample id=75468. Maximum sequence length: 2049, sample length: 2614 +[default0]:Skipping sample id=1493187. Maximum sequence length: 2049, sample length: 2279 +[default0]:Skipping sample id=38774. Maximum sequence length: 2049, sample length: 3370 +[default0]:Skipping sample id=905376. Maximum sequence length: 2049, sample length: 2107 +[default0]:Skipping sample id=54017. Maximum sequence length: 2049, sample length: 3964 +[default0]:Skipping sample id=268101. Maximum sequence length: 2049, sample length: 5065 +[default0]:Skipping sample id=87924. Maximum sequence length: 2049, sample length: 2080 +[default0]:Skipping sample id=1254042. Maximum sequence length: 2049, sample length: 4431 +[default0]:Skipping sample id=417777. Maximum sequence length: 2049, sample length: 2094 +[default0]:Skipping sample id=751823. Maximum sequence length: 2049, sample length: 2483 +[default0]:Skipping sample id=1313811. Maximum sequence length: 2049, sample length: 5581 +[default0]:Skipping sample id=1538581. Maximum sequence length: 2049, sample length: 2380 +[default0]:Skipping sample id=954563. Maximum sequence length: 2049, sample length: 4686 +[default0]:Skipping sample id=948709. Maximum sequence length: 2049, sample length: 3018 +[default0]:Skipping sample id=123982. Maximum sequence length: 2049, sample length: 3789 +[default0]:Skipping sample id=529233. Maximum sequence length: 2049, sample length: 2459 +[default0]:Skipping sample id=1159255. Maximum sequence length: 2049, sample length: 2222 +[default0]:Skipping sample id=936072. Maximum sequence length: 2049, sample length: 5383 +[default0]:Skipping sample id=361323. Maximum sequence length: 2049, sample length: 2389 +[default0]:Skipping sample id=1004043. Maximum sequence length: 2049, sample length: 2199 +[default0]:Skipping sample id=1088575. Maximum sequence length: 2049, sample length: 4241 +[default0]:Skipping sample id=690594. Maximum sequence length: 2049, sample length: 2175 +[default0]:Skipping sample id=448476. Maximum sequence length: 2049, sample length: 2826 +[default0]:Skipping sample id=1441857. Maximum sequence length: 2049, sample length: 3060 +[default0]:Skipping sample id=614852. Maximum sequence length: 2049, sample length: 3735 +[default0]:Skipping sample id=903731. Maximum sequence length: 2049, sample length: 6213 +[default0]:Skipping sample id=143573. Maximum sequence length: 2049, sample length: 2249 +[default0]:Skipping sample id=598834. Maximum sequence length: 2049, sample length: 2182 +[default0]:Skipping sample id=449012. Maximum sequence length: 2049, sample length: 2339 +[default0]:Skipping sample id=937368. Maximum sequence length: 2049, sample length: 2320 +[default0]:Skipping sample id=359912. Maximum sequence length: 2049, sample length: 2886 +[default0]:Skipping sample id=62789. Maximum sequence length: 2049, sample length: 2133 +[default0]:Skipping sample id=472368. Maximum sequence length: 2049, sample length: 2124 +[default0]:Skipping sample id=112143. Maximum sequence length: 2049, sample length: 2193 +[default0]:Skipping sample id=715522. Maximum sequence length: 2049, sample length: 2221 +[default0]:Skipping sample id=415669. Maximum sequence length: 2049, sample length: 3325 +[default0]:Skipping sample id=904569. Maximum sequence length: 2049, sample length: 2986 +[default0]:Skipping sample id=246147. Maximum sequence length: 2049, sample length: 2182 +[default0]:Skipping sample id=1079642. Maximum sequence length: 2049, sample length: 3041 +[default0]:Skipping sample id=386547. Maximum sequence length: 2049, sample length: 2924 +[default0]:Skipping sample id=852742. Maximum sequence length: 2049, sample length: 3023 +[default0]:Skipping sample id=472015. Maximum sequence length: 2049, sample length: 2634 +[default0]:Skipping sample id=922420. Maximum sequence length: 2049, sample length: 2060 +[default0]:Skipping sample id=880208. Maximum sequence length: 2049, sample length: 3870 +[default0]:Skipping sample id=267877. Maximum sequence length: 2049, sample length: 2104 +[default0]:Skipping sample id=1223891. Maximum sequence length: 2049, sample length: 2190 +[default0]:Skipping sample id=691943. Maximum sequence length: 2049, sample length: 5608 +[default0]:Skipping sample id=1509571. Maximum sequence length: 2049, sample length: 3302 +[default0]:Skipping sample id=837765. Maximum sequence length: 2049, sample length: 2253 +[default0]:Skipping sample id=299910. Maximum sequence length: 2049, sample length: 4538 +[default0]:Skipping sample id=566226. Maximum sequence length: 2049, sample length: 2364 +[default0]:Skipping sample id=366060. Maximum sequence length: 2049, sample length: 2395 +[default0]:Skipping sample id=868150. Maximum sequence length: 2049, sample length: 3113 +[default0]:Skipping sample id=231587. Maximum sequence length: 2049, sample length: 2724 +[default0]:Skipping sample id=476740. Maximum sequence length: 2049, sample length: 2261 +[default0]:Skipping sample id=277375. Maximum sequence length: 2049, sample length: 2658 +[default0]:Skipping sample id=1523381. Maximum sequence length: 2049, sample length: 2843 +[default0]:Skipping sample id=1503728. Maximum sequence length: 2049, sample length: 2921 +[default0]:Skipping sample id=1031403. Maximum sequence length: 2049, sample length: 2099 +[default0]:Skipping sample id=185386. Maximum sequence length: 2049, sample length: 2114 +[default0]:Skipping sample id=1313136. Maximum sequence length: 2049, sample length: 2345 +[default0]:Skipping sample id=1274340. Maximum sequence length: 2049, sample length: 4153 +[default0]:Skipping sample id=421590. Maximum sequence length: 2049, sample length: 2197 +[default0]:Skipping sample id=267286. Maximum sequence length: 2049, sample length: 2247 +[default0]:Skipping sample id=1124090. Maximum sequence length: 2049, sample length: 2101 +[default0]:Skipping sample id=494521. Maximum sequence length: 2049, sample length: 2429 +[default0]:Skipping sample id=1356523. Maximum sequence length: 2049, sample length: 2444 +[default0]:Skipping sample id=835327. Maximum sequence length: 2049, sample length: 2801 +[default0]:Skipping sample id=303059. Maximum sequence length: 2049, sample length: 2672 +[default0]:Skipping sample id=195473. Maximum sequence length: 2049, sample length: 2970 +[default0]:Skipping sample id=1445573. Maximum sequence length: 2049, sample length: 2709 +[default0]:Skipping sample id=1521204. Maximum sequence length: 2049, sample length: 2218 +[default0]:Skipping sample id=1372174. Maximum sequence length: 2049, sample length: 2137 +[default0]:Skipping sample id=585479. Maximum sequence length: 2049, sample length: 2052 +[default0]:Skipping sample id=84560. Maximum sequence length: 2049, sample length: 2754 +[default0]:Skipping sample id=370019. Maximum sequence length: 2049, sample length: 2623 +[default0]:Skipping sample id=871462. Maximum sequence length: 2049, sample length: 4394 +[default0]:Skipping sample id=731526. Maximum sequence length: 2049, sample length: 2812 +[default0]:Skipping sample id=1354050. Maximum sequence length: 2049, sample length: 3081 +[default0]:Skipping sample id=45539. Maximum sequence length: 2049, sample length: 2359 +[default0]:Skipping sample id=1459282. Maximum sequence length: 2049, sample length: 2132 +[default0]:Skipping sample id=1195699. Maximum sequence length: 2049, sample length: 2523 +[default0]:Skipping sample id=742783. Maximum sequence length: 2049, sample length: 4093 +[default0]:Skipping sample id=445753. Maximum sequence length: 2049, sample length: 2346 +[default0]:Skipping sample id=1373274. Maximum sequence length: 2049, sample length: 2872 +[default0]:Skipping sample id=146625. Maximum sequence length: 2049, sample length: 2102 +[default0]:Skipping sample id=1258190. Maximum sequence length: 2049, sample length: 2264 +[default0]:Skipping sample id=228016. Maximum sequence length: 2049, sample length: 4651 +[default0]:Skipping sample id=886519. Maximum sequence length: 2049, sample length: 3099 +[default0]:Skipping sample id=954105. Maximum sequence length: 2049, sample length: 3079 +[default0]:Skipping sample id=696583. Maximum sequence length: 2049, sample length: 4237 +[default0]:Skipping sample id=274926. Maximum sequence length: 2049, sample length: 4240 +[default0]:Skipping sample id=226498. Maximum sequence length: 2049, sample length: 2372 +[default0]:Skipping sample id=640432. Maximum sequence length: 2049, sample length: 2175 +[default0]:Skipping sample id=1168561. Maximum sequence length: 2049, sample length: 4563 +[default0]:Skipping sample id=1403042. Maximum sequence length: 2049, sample length: 2345 +[default0]:Skipping sample id=653917. Maximum sequence length: 2049, sample length: 2642 +[default0]:Skipping sample id=441471. Maximum sequence length: 2049, sample length: 2446 +[default0]:Skipping sample id=343169. Maximum sequence length: 2049, sample length: 3086 +[default0]:Skipping sample id=437142. Maximum sequence length: 2049, sample length: 3906 +[default0]:Skipping sample id=71914. Maximum sequence length: 2049, sample length: 3757 +[default0]:Skipping sample id=100485. Maximum sequence length: 2049, sample length: 2297 +[default0]:Skipping sample id=1316324. Maximum sequence length: 2049, sample length: 2251 +[default0]:Skipping sample id=927411. Maximum sequence length: 2049, sample length: 2709 +[default0]:Skipping sample id=80680. Maximum sequence length: 2049, sample length: 2426 +[default0]:Skipping sample id=1557784. Maximum sequence length: 2049, sample length: 2931 +[default0]:Skipping sample id=1425724. Maximum sequence length: 2049, sample length: 2365 +[default0]:Skipping sample id=1329416. Maximum sequence length: 2049, sample length: 2552 +[default0]:Skipping sample id=579733. Maximum sequence length: 2049, sample length: 4603 +[default0]:Skipping sample id=1044644. Maximum sequence length: 2049, sample length: 2367 +[default0]:Skipping sample id=662095. Maximum sequence length: 2049, sample length: 2667 +[default0]:Skipping sample id=224252. Maximum sequence length: 2049, sample length: 5097 +[default0]:Skipping sample id=1033715. Maximum sequence length: 2049, sample length: 2611 +[default0]:Skipping sample id=962163. Maximum sequence length: 2049, sample length: 4374 +[default0]:Skipping sample id=355770. Maximum sequence length: 2049, sample length: 2336 +[default0]:Skipping sample id=1028301. Maximum sequence length: 2049, sample length: 2881 +[default0]:Skipping sample id=831301. Maximum sequence length: 2049, sample length: 2725 +[default0]:Skipping sample id=294470. Maximum sequence length: 2049, sample length: 3194 +[default0]:Skipping sample id=1078820. Maximum sequence length: 2049, sample length: 2716 +[default0]:Skipping sample id=903812. Maximum sequence length: 2049, sample length: 2100 +[default0]:Skipping sample id=700871. Maximum sequence length: 2049, sample length: 2390 +[default0]:Skipping sample id=1016515. Maximum sequence length: 2049, sample length: 2111 +[default0]:Skipping sample id=1543570. Maximum sequence length: 2049, sample length: 3778 +[default0]:Skipping sample id=84238. Maximum sequence length: 2049, sample length: 2867 +[default0]:Skipping sample id=1568351. Maximum sequence length: 2049, sample length: 2428 +[default0]:Skipping sample id=705138. Maximum sequence length: 2049, sample length: 4534 +[default0]:Skipping sample id=949616. Maximum sequence length: 2049, sample length: 3043 +[default0]:Skipping sample id=960488. Maximum sequence length: 2049, sample length: 2468 +[default0]:Skipping sample id=862880. Maximum sequence length: 2049, sample length: 4609 +[default0]:Skipping sample id=50993. Maximum sequence length: 2049, sample length: 2574 +[default0]:Skipping sample id=151593. Maximum sequence length: 2049, sample length: 2125 +[default0]:Skipping sample id=553473. Maximum sequence length: 2049, sample length: 3782 +[default0]:Skipping sample id=1342983. Maximum sequence length: 2049, sample length: 2256 +[default0]:Skipping sample id=766832. Maximum sequence length: 2049, sample length: 8323 +[default0]:Skipping sample id=1227994. Maximum sequence length: 2049, sample length: 2088 +[default0]:Skipping sample id=480518. Maximum sequence length: 2049, sample length: 2504 +[default0]:Skipping sample id=64631. Maximum sequence length: 2049, sample length: 2202 +[default0]:Skipping sample id=696171. Maximum sequence length: 2049, sample length: 2156 +[default0]:Skipping sample id=1241112. Maximum sequence length: 2049, sample length: 2378 +[default0]:Skipping sample id=1437995. Maximum sequence length: 2049, sample length: 2633 +[default0]:Skipping sample id=1083676. Maximum sequence length: 2049, sample length: 3079 +[default0]:Skipping sample id=1540676. Maximum sequence length: 2049, sample length: 3407 +[default0]:Skipping sample id=951091. Maximum sequence length: 2049, sample length: 2119 +[default0]:Skipping sample id=390649. Maximum sequence length: 2049, sample length: 2689 +[default0]:Skipping sample id=58598. Maximum sequence length: 2049, sample length: 3339 +[default0]:Skipping sample id=1355516. Maximum sequence length: 2049, sample length: 2756 +[default0]:Skipping sample id=596104. Maximum sequence length: 2049, sample length: 2818 +[default0]:Skipping sample id=1106195. Maximum sequence length: 2049, sample length: 3044 +[default0]:Skipping sample id=651002. Maximum sequence length: 2049, sample length: 2268 +[default0]:Skipping sample id=767554. Maximum sequence length: 2049, sample length: 2999 +[default0]:Skipping sample id=946961. Maximum sequence length: 2049, sample length: 3269 +[default0]:Skipping sample id=163953. Maximum sequence length: 2049, sample length: 3429 +[default0]:Skipping sample id=278627. Maximum sequence length: 2049, sample length: 2625 +[default0]:Skipping sample id=1356176. Maximum sequence length: 2049, sample length: 2690 +[default0]:Skipping sample id=789754. Maximum sequence length: 2049, sample length: 3715 +[default0]:Skipping sample id=1208914. Maximum sequence length: 2049, sample length: 2166 +[default0]:Skipping sample id=1007035. Maximum sequence length: 2049, sample length: 2979 +[default0]:Skipping sample id=802065. Maximum sequence length: 2049, sample length: 3123 +[default0]:Skipping sample id=1329090. Maximum sequence length: 2049, sample length: 4229 +[default0]:Skipping sample id=347215. Maximum sequence length: 2049, sample length: 2615 +[default0]:Skipping sample id=519583. Maximum sequence length: 2049, sample length: 2218 +[default0]:Skipping sample id=538589. Maximum sequence length: 2049, sample length: 2350 +[default0]:Skipping sample id=925729. Maximum sequence length: 2049, sample length: 2734 +[default0]:Skipping sample id=1243775. Maximum sequence length: 2049, sample length: 4304 +[default0]:Skipping sample id=256284. Maximum sequence length: 2049, sample length: 2739 +[default0]:Skipping sample id=736097. Maximum sequence length: 2049, sample length: 2603 +[default0]:Skipping sample id=745720. Maximum sequence length: 2049, sample length: 2364 +[default0]:Skipping sample id=1208183. Maximum sequence length: 2049, sample length: 2594 +[default0]:Skipping sample id=4404. Maximum sequence length: 2049, sample length: 3638 +[default0]:Skipping sample id=894979. Maximum sequence length: 2049, sample length: 2609 +[default0]:Skipping sample id=1342683. Maximum sequence length: 2049, sample length: 2672 +[default0]:Skipping sample id=243008. Maximum sequence length: 2049, sample length: 2445 +[default0]:Skipping sample id=1099912. Maximum sequence length: 2049, sample length: 4517 +[default0]:Skipping sample id=68978. Maximum sequence length: 2049, sample length: 2736 +[default0]:Skipping sample id=13187. Maximum sequence length: 2049, sample length: 2369 +[default0]:Skipping sample id=907038. Maximum sequence length: 2049, sample length: 2761 +[default0]:Skipping sample id=1258307. Maximum sequence length: 2049, sample length: 2578 +[default0]:Skipping sample id=251737. Maximum sequence length: 2049, sample length: 3778 +[default0]:Skipping sample id=1569695. Maximum sequence length: 2049, sample length: 2317 +[default0]:Skipping sample id=262385. Maximum sequence length: 2049, sample length: 2512 +[default0]:Skipping sample id=1435758. Maximum sequence length: 2049, sample length: 3891 +[default0]:Skipping sample id=597738. Maximum sequence length: 2049, sample length: 2104 +[default0]:Skipping sample id=1570796. Maximum sequence length: 2049, sample length: 2388 +[default0]:Skipping sample id=99242. Maximum sequence length: 2049, sample length: 2277 +[default0]:Skipping sample id=720002. Maximum sequence length: 2049, sample length: 2363 +[default0]:Skipping sample id=1005517. Maximum sequence length: 2049, sample length: 2116 +[default0]:Skipping sample id=1060114. Maximum sequence length: 2049, sample length: 2091 +[default0]:Skipping sample id=1568153. Maximum sequence length: 2049, sample length: 3377 +[default0]:Skipping sample id=1412322. Maximum sequence length: 2049, sample length: 3005 +[default0]:Skipping sample id=1039487. Maximum sequence length: 2049, sample length: 3045 +[default0]:Skipping sample id=953098. Maximum sequence length: 2049, sample length: 2618 +[default0]:Skipping sample id=1480440. Maximum sequence length: 2049, sample length: 2134 +[default0]:Skipping sample id=941457. Maximum sequence length: 2049, sample length: 2401 +[default0]:Skipping sample id=1354565. Maximum sequence length: 2049, sample length: 2400 +[default0]:Skipping sample id=1506723. Maximum sequence length: 2049, sample length: 4098 +[default0]:Skipping sample id=1004266. Maximum sequence length: 2049, sample length: 2313 +[default0]:Skipping sample id=663359. Maximum sequence length: 2049, sample length: 2070 +[default0]:Skipping sample id=248590. Maximum sequence length: 2049, sample length: 2212 +[default0]:Skipping sample id=879567. Maximum sequence length: 2049, sample length: 3124 +[default0]:Skipping sample id=220117. Maximum sequence length: 2049, sample length: 2765 +[default0]:Skipping sample id=1055452. Maximum sequence length: 2049, sample length: 3312 +[default0]:Skipping sample id=1569792. Maximum sequence length: 2049, sample length: 2067 +[default0]:Skipping sample id=1258755. Maximum sequence length: 2049, sample length: 5585 +[default0]:Skipping sample id=578730. Maximum sequence length: 2049, sample length: 2196 +[default0]:Skipping sample id=258243. Maximum sequence length: 2049, sample length: 2491 +[default0]:Skipping sample id=1319342. Maximum sequence length: 2049, sample length: 2526 +[default0]:Skipping sample id=885144. Maximum sequence length: 2049, sample length: 3558 +[default0]:Skipping sample id=911797. Maximum sequence length: 2049, sample length: 2441 +[default0]:Skipping sample id=1013358. Maximum sequence length: 2049, sample length: 3418 +[default0]:Skipping sample id=1105397. Maximum sequence length: 2049, sample length: 3680 +[default0]:Skipping sample id=710631. Maximum sequence length: 2049, sample length: 4290 +[default0]:Skipping sample id=1536492. Maximum sequence length: 2049, sample length: 2646 +[default0]:Skipping sample id=1506210. Maximum sequence length: 2049, sample length: 2607 +[default0]:Skipping sample id=1144868. Maximum sequence length: 2049, sample length: 6092 +[default0]:Skipping sample id=661774. Maximum sequence length: 2049, sample length: 3128 +[default0]:Skipping sample id=797870. Maximum sequence length: 2049, sample length: 3743 +[default0]:Skipping sample id=1207776. Maximum sequence length: 2049, sample length: 3500 +[default0]:Skipping sample id=963615. Maximum sequence length: 2049, sample length: 2685 +[default0]:Skipping sample id=1390487. Maximum sequence length: 2049, sample length: 3455 +[default0]:Skipping sample id=494627. Maximum sequence length: 2049, sample length: 2220 +[default0]:Skipping sample id=1202359. Maximum sequence length: 2049, sample length: 2530 +[default0]:Skipping sample id=628039. Maximum sequence length: 2049, sample length: 2375 +[default0]:Skipping sample id=396241. Maximum sequence length: 2049, sample length: 3490 +[default0]:Skipping sample id=705678. Maximum sequence length: 2049, sample length: 3787 +[default0]:Skipping sample id=797963. Maximum sequence length: 2049, sample length: 2718 +[default0]:Skipping sample id=285211. Maximum sequence length: 2049, sample length: 2137 +[default0]:Skipping sample id=1420176. Maximum sequence length: 2049, sample length: 2295 +[default0]:Skipping sample id=1331267. Maximum sequence length: 2049, sample length: 2544 +[default0]:Skipping sample id=671610. Maximum sequence length: 2049, sample length: 2516 +[default0]:Skipping sample id=1192072. Maximum sequence length: 2049, sample length: 5368 +[default0]:Skipping sample id=1300109. Maximum sequence length: 2049, sample length: 2974 +[default0]:Skipping sample id=1287992. Maximum sequence length: 2049, sample length: 3122 +[default0]:Skipping sample id=1255136. Maximum sequence length: 2049, sample length: 2805 +[default0]:Skipping sample id=890010. Maximum sequence length: 2049, sample length: 5549 +[default0]:Skipping sample id=460424. Maximum sequence length: 2049, sample length: 2995 +[default0]:Skipping sample id=1502516. Maximum sequence length: 2049, sample length: 2110 +[default0]:Skipping sample id=1471801. Maximum sequence length: 2049, sample length: 2105 +[default0]:Skipping sample id=1066932. Maximum sequence length: 2049, sample length: 2194 +[default0]:Skipping sample id=340681. Maximum sequence length: 2049, sample length: 3077 +[default0]:Skipping sample id=258905. Maximum sequence length: 2049, sample length: 2272 +[default0]:Skipping sample id=281297. Maximum sequence length: 2049, sample length: 2860 +[default0]:Skipping sample id=94819. Maximum sequence length: 2049, sample length: 3369 +[default0]:Skipping sample id=1232872. Maximum sequence length: 2049, sample length: 3058 +[default0]:Skipping sample id=322332. Maximum sequence length: 2049, sample length: 2471 +[default0]:Skipping sample id=1202331. Maximum sequence length: 2049, sample length: 2157 +[default0]:Skipping sample id=585221. Maximum sequence length: 2049, sample length: 4602 +[default0]:Skipping sample id=584975. Maximum sequence length: 2049, sample length: 3230 +[default0]:Skipping sample id=1328163. Maximum sequence length: 2049, sample length: 2774 +[default0]:Skipping sample id=992942. Maximum sequence length: 2049, sample length: 2609 +[default0]:Skipping sample id=1369791. Maximum sequence length: 2049, sample length: 2543 +[default0]:Skipping sample id=1077109. Maximum sequence length: 2049, sample length: 3263 +[default0]:Skipping sample id=96507. Maximum sequence length: 2049, sample length: 4209 +[default0]:Skipping sample id=1106529. Maximum sequence length: 2049, sample length: 2165 +[default0]:Skipping sample id=1162944. Maximum sequence length: 2049, sample length: 3684 +[default0]:Skipping sample id=1465478. Maximum sequence length: 2049, sample length: 2766 +[default0]:Skipping sample id=408309. Maximum sequence length: 2049, sample length: 5651 +[default0]:Skipping sample id=1018576. Maximum sequence length: 2049, sample length: 2859 +[default0]:Skipping sample id=267425. Maximum sequence length: 2049, sample length: 2359 +[default0]:Skipping sample id=832270. Maximum sequence length: 2049, sample length: 4014 +[default0]:Skipping sample id=1536478. Maximum sequence length: 2049, sample length: 3165 +[default0]:Skipping sample id=281543. Maximum sequence length: 2049, sample length: 2280 +[default0]:Skipping sample id=79170. Maximum sequence length: 2049, sample length: 2655 +[default0]:Skipping sample id=948389. Maximum sequence length: 2049, sample length: 2086 +[default0]:Skipping sample id=253145. Maximum sequence length: 2049, sample length: 4810 +[default0]:Skipping sample id=692374. Maximum sequence length: 2049, sample length: 2636 +[default0]:Skipping sample id=690786. Maximum sequence length: 2049, sample length: 4657 +[default0]:Skipping sample id=984261. Maximum sequence length: 2049, sample length: 2293 +[default0]:Skipping sample id=922213. Maximum sequence length: 2049, sample length: 2568 +[default0]:Skipping sample id=364127. Maximum sequence length: 2049, sample length: 4467 +[default0]:Skipping sample id=1386700. Maximum sequence length: 2049, sample length: 2879 +[default0]:Skipping sample id=632286. Maximum sequence length: 2049, sample length: 2146 +[default0]:Skipping sample id=506952. Maximum sequence length: 2049, sample length: 3138 +[default0]:Skipping sample id=965810. Maximum sequence length: 2049, sample length: 3359 +[default0]:Skipping sample id=223586. Maximum sequence length: 2049, sample length: 2230 +[default0]:Skipping sample id=1003204. Maximum sequence length: 2049, sample length: 3028 +[default0]:Skipping sample id=486574. Maximum sequence length: 2049, sample length: 3315 +[default0]:Skipping sample id=252066. Maximum sequence length: 2049, sample length: 3584 +[default0]:Skipping sample id=965201. Maximum sequence length: 2049, sample length: 5254 +[default0]:Skipping sample id=1136411. Maximum sequence length: 2049, sample length: 2400 +[default0]:Skipping sample id=266273. Maximum sequence length: 2049, sample length: 2146 +[default0]:Skipping sample id=1258723. Maximum sequence length: 2049, sample length: 5783 +[default0]:Skipping sample id=1544906. Maximum sequence length: 2049, sample length: 2529 +[default0]:Skipping sample id=179930. Maximum sequence length: 2049, sample length: 4432 +[default0]:Skipping sample id=665892. Maximum sequence length: 2049, sample length: 2411 +[default0]:Skipping sample id=1288389. Maximum sequence length: 2049, sample length: 3407 +[default0]:Skipping sample id=407158. Maximum sequence length: 2049, sample length: 3155 +[default0]:Skipping sample id=1032700. Maximum sequence length: 2049, sample length: 5067 +[default0]:Skipping sample id=1116342. Maximum sequence length: 2049, sample length: 2071 +[default0]:Skipping sample id=1523691. Maximum sequence length: 2049, sample length: 5007 +[default0]:Skipping sample id=683125. Maximum sequence length: 2049, sample length: 4302 +[default0]:Skipping sample id=414231. Maximum sequence length: 2049, sample length: 2846 +[default0]:Skipping sample id=508611. Maximum sequence length: 2049, sample length: 2851 +[default0]:Skipping sample id=566547. Maximum sequence length: 2049, sample length: 2312 +[default0]:Skipping sample id=90253. Maximum sequence length: 2049, sample length: 3435 +[default0]:Skipping sample id=393610. Maximum sequence length: 2049, sample length: 4666 +[default0]:Skipping sample id=1208073. Maximum sequence length: 2049, sample length: 4320 +[default0]:Skipping sample id=14328. Maximum sequence length: 2049, sample length: 2902 +[default0]:Skipping sample id=424614. Maximum sequence length: 2049, sample length: 2315 +[default0]:Skipping sample id=326497. Maximum sequence length: 2049, sample length: 3459 +[default0]:Skipping sample id=1411761. Maximum sequence length: 2049, sample length: 2890 +[default0]:Skipping sample id=315352. Maximum sequence length: 2049, sample length: 2129 +[default0]:Skipping sample id=1572317. Maximum sequence length: 2049, sample length: 3153 +[default0]:Skipping sample id=332514. Maximum sequence length: 2049, sample length: 3625 +[default0]:Skipping sample id=1204033. Maximum sequence length: 2049, sample length: 2074 +[default0]:Skipping sample id=251525. Maximum sequence length: 2049, sample length: 3732 +[default0]:Skipping sample id=1163394. Maximum sequence length: 2049, sample length: 2624 +[default0]:Skipping sample id=1057125. Maximum sequence length: 2049, sample length: 2491 +[default0]:Skipping sample id=1554503. Maximum sequence length: 2049, sample length: 2342 +[default0]:Skipping sample id=508645. Maximum sequence length: 2049, sample length: 2117 +[default0]:Skipping sample id=1177080. Maximum sequence length: 2049, sample length: 3242 +[default0]:Skipping sample id=605990. Maximum sequence length: 2049, sample length: 2369 +[default0]:Skipping sample id=932116. Maximum sequence length: 2049, sample length: 2153 +[default0]:Skipping sample id=257748. Maximum sequence length: 2049, sample length: 2552 +[default0]:Skipping sample id=639992. Maximum sequence length: 2049, sample length: 4236 +[default0]:Skipping sample id=459194. Maximum sequence length: 2049, sample length: 3263 +[default0]:Skipping sample id=499439. Maximum sequence length: 2049, sample length: 2399 +[default0]:Skipping sample id=883962. Maximum sequence length: 2049, sample length: 2065 +[default0]:Skipping sample id=1212205. Maximum sequence length: 2049, sample length: 2558 +[default0]:Skipping sample id=406496. Maximum sequence length: 2049, sample length: 2833 +[default0]:Skipping sample id=1199526. Maximum sequence length: 2049, sample length: 2469 +[default0]:Skipping sample id=1213262. Maximum sequence length: 2049, sample length: 2815 +[default0]:Skipping sample id=365284. Maximum sequence length: 2049, sample length: 2437 +[default0]:Skipping sample id=205458. Maximum sequence length: 2049, sample length: 2373 +[default0]:Skipping sample id=1257271. Maximum sequence length: 2049, sample length: 2165 +[default0]:Skipping sample id=365981. Maximum sequence length: 2049, sample length: 2541 +[default0]:Skipping sample id=1180967. Maximum sequence length: 2049, sample length: 2369 +[default0]:Skipping sample id=1400947. Maximum sequence length: 2049, sample length: 2092 +[default0]:Skipping sample id=1186776. Maximum sequence length: 2049, sample length: 3668 +[default0]:Skipping sample id=78370. Maximum sequence length: 2049, sample length: 3900 +[default0]:Skipping sample id=182094. Maximum sequence length: 2049, sample length: 2791 +[default0]:Skipping sample id=1174320. Maximum sequence length: 2049, sample length: 2730 +[default0]:Skipping sample id=1241288. Maximum sequence length: 2049, sample length: 3325 +[default0]:Skipping sample id=1155605. Maximum sequence length: 2049, sample length: 3316 +[default0]:Skipping sample id=1183182. Maximum sequence length: 2049, sample length: 2290 +[default0]:Skipping sample id=504644. Maximum sequence length: 2049, sample length: 2525 +[default0]:Skipping sample id=158590. Maximum sequence length: 2049, sample length: 2050 +[default0]:Skipping sample id=1454004. Maximum sequence length: 2049, sample length: 2756 +[default0]:Skipping sample id=502407. Maximum sequence length: 2049, sample length: 4495 +[default0]:Skipping sample id=695583. Maximum sequence length: 2049, sample length: 2477 +[default0]:Skipping sample id=910893. Maximum sequence length: 2049, sample length: 3240 +[default0]:Skipping sample id=523234. Maximum sequence length: 2049, sample length: 4636 +[default0]:Skipping sample id=1353712. Maximum sequence length: 2049, sample length: 3145 +[default0]:Skipping sample id=1155585. Maximum sequence length: 2049, sample length: 2433 +[default0]:Skipping sample id=221957. Maximum sequence length: 2049, sample length: 2727 +[default0]:Skipping sample id=1532926. Maximum sequence length: 2049, sample length: 4428 +[default0]:Skipping sample id=412868. Maximum sequence length: 2049, sample length: 2597 +[default0]:Skipping sample id=890548. Maximum sequence length: 2049, sample length: 2542 +[default0]:Skipping sample id=871710. Maximum sequence length: 2049, sample length: 3660 +[default0]:Skipping sample id=1138312. Maximum sequence length: 2049, sample length: 2572 +[default0]:Skipping sample id=1426688. Maximum sequence length: 2049, sample length: 4362 +[default0]:Skipping sample id=430070. Maximum sequence length: 2049, sample length: 3595 +[default0]:Skipping sample id=1281139. Maximum sequence length: 2049, sample length: 2165 +[default0]:Skipping sample id=416163. Maximum sequence length: 2049, sample length: 2249 +[default0]:Skipping sample id=1089734. Maximum sequence length: 2049, sample length: 6259 +[default0]:Skipping sample id=679281. Maximum sequence length: 2049, sample length: 3139 +[default0]:Skipping sample id=1475524. Maximum sequence length: 2049, sample length: 3644 +[default0]:Skipping sample id=441106. Maximum sequence length: 2049, sample length: 4102 +[default0]:Skipping sample id=925585. Maximum sequence length: 2049, sample length: 2310 +[default0]:Skipping sample id=1306274. Maximum sequence length: 2049, sample length: 2928 +[default0]:Skipping sample id=1294263. Maximum sequence length: 2049, sample length: 3027 +[default0]:Skipping sample id=1192371. Maximum sequence length: 2049, sample length: 2747 +[default0]:Skipping sample id=481900. Maximum sequence length: 2049, sample length: 2175 +[default0]:Skipping sample id=796017. Maximum sequence length: 2049, sample length: 2485 +[default0]:Skipping sample id=1506320. Maximum sequence length: 2049, sample length: 4752 +[default0]:Skipping sample id=340810. Maximum sequence length: 2049, sample length: 2226 +[default0]:Skipping sample id=1296833. Maximum sequence length: 2049, sample length: 2562 +[default0]:Skipping sample id=1556377. Maximum sequence length: 2049, sample length: 3513 +[default0]:Skipping sample id=886973. Maximum sequence length: 2049, sample length: 2247 +[default0]:Skipping sample id=880107. Maximum sequence length: 2049, sample length: 2795 +[default0]:Skipping sample id=1442620. Maximum sequence length: 2049, sample length: 2674 +[default0]:Skipping sample id=370514. Maximum sequence length: 2049, sample length: 2157 +[default0]:Skipping sample id=531575. Maximum sequence length: 2049, sample length: 2200 +[default0]:Skipping sample id=1489213. Maximum sequence length: 2049, sample length: 2984 +[default0]:Skipping sample id=1462061. Maximum sequence length: 2049, sample length: 2136 +[default0]:Skipping sample id=1029414. Maximum sequence length: 2049, sample length: 2667 +[default0]:Skipping sample id=1406695. Maximum sequence length: 2049, sample length: 2508 +[default0]:Skipping sample id=1455876. Maximum sequence length: 2049, sample length: 2091 +[default0]:Skipping sample id=1058927. Maximum sequence length: 2049, sample length: 2355 +[default0]:Skipping sample id=1372750. Maximum sequence length: 2049, sample length: 2356 +[default0]:Skipping sample id=1419720. Maximum sequence length: 2049, sample length: 2502 +[default0]:Skipping sample id=307533. Maximum sequence length: 2049, sample length: 4797 +[default0]:Skipping sample id=1085440. Maximum sequence length: 2049, sample length: 2077 +[default0]:Skipping sample id=59424. Maximum sequence length: 2049, sample length: 6702 +[default0]:Skipping sample id=89982. Maximum sequence length: 2049, sample length: 2795 +[default0]:Skipping sample id=1372963. Maximum sequence length: 2049, sample length: 2333 +[default0]:Skipping sample id=426768. Maximum sequence length: 2049, sample length: 3925 +[default0]:Skipping sample id=10433. Maximum sequence length: 2049, sample length: 2605 +[default0]:Skipping sample id=1490810. Maximum sequence length: 2049, sample length: 3845 +[default0]:Skipping sample id=217787. Maximum sequence length: 2049, sample length: 2511 +[default0]:Skipping sample id=649424. Maximum sequence length: 2049, sample length: 2486 +[default0]:Skipping sample id=931353. Maximum sequence length: 2049, sample length: 6453 +[default0]:Skipping sample id=621548. Maximum sequence length: 2049, sample length: 2497 +[default0]:Skipping sample id=271254. Maximum sequence length: 2049, sample length: 2851 +[default0]:Skipping sample id=734908. Maximum sequence length: 2049, sample length: 2533 +[default0]:Skipping sample id=1047400. Maximum sequence length: 2049, sample length: 3273 +[default0]:Skipping sample id=999880. Maximum sequence length: 2049, sample length: 4232 +[default0]:Skipping sample id=1043547. Maximum sequence length: 2049, sample length: 2575 +[default0]:Skipping sample id=650139. Maximum sequence length: 2049, sample length: 2759 +[default0]:Skipping sample id=577875. Maximum sequence length: 2049, sample length: 3382 +[default0]:Skipping sample id=510378. Maximum sequence length: 2049, sample length: 2915 +[default0]:Skipping sample id=396247. Maximum sequence length: 2049, sample length: 3425 +[default0]:Skipping sample id=1304811. Maximum sequence length: 2049, sample length: 4428 +[default0]:Skipping sample id=1298223. Maximum sequence length: 2049, sample length: 2051 +[default0]:Skipping sample id=207412. Maximum sequence length: 2049, sample length: 2189 +[default0]:Skipping sample id=219231. Maximum sequence length: 2049, sample length: 2618 +[default0]:Skipping sample id=701708. Maximum sequence length: 2049, sample length: 2264 +[default0]:Skipping sample id=1381324. Maximum sequence length: 2049, sample length: 4251 +[default0]:Skipping sample id=844602. Maximum sequence length: 2049, sample length: 2427 +[default0]:Skipping sample id=1226281. Maximum sequence length: 2049, sample length: 4898 +[default0]:Skipping sample id=2168. Maximum sequence length: 2049, sample length: 2076 +[default0]:Skipping sample id=1000627. Maximum sequence length: 2049, sample length: 3736 +[default0]:Skipping sample id=982726. Maximum sequence length: 2049, sample length: 2297 +[default0]:Skipping sample id=277028. Maximum sequence length: 2049, sample length: 2102 +[default0]:Skipping sample id=281613. Maximum sequence length: 2049, sample length: 2114 +[default0]:Skipping sample id=264293. Maximum sequence length: 2049, sample length: 2349 +[default0]:Skipping sample id=1604. Maximum sequence length: 2049, sample length: 2322 +[default0]:Skipping sample id=778014. Maximum sequence length: 2049, sample length: 2437 +[default0]:Skipping sample id=899644. Maximum sequence length: 2049, sample length: 3058 +[default0]:Skipping sample id=525570. Maximum sequence length: 2049, sample length: 3088 +[default0]:Skipping sample id=604944. Maximum sequence length: 2049, sample length: 2369 +[default0]:Skipping sample id=505613. Maximum sequence length: 2049, sample length: 2132 +[default0]:Skipping sample id=502116. Maximum sequence length: 2049, sample length: 3313 +[default0]:Skipping sample id=1415384. Maximum sequence length: 2049, sample length: 3666 +[default0]:Skipping sample id=471102. Maximum sequence length: 2049, sample length: 2205 +[default0]:Skipping sample id=544680. Maximum sequence length: 2049, sample length: 2133 +[default0]:Skipping sample id=968009. Maximum sequence length: 2049, sample length: 2745 +[default0]:Skipping sample id=1120028. Maximum sequence length: 2049, sample length: 2612 +[default0]:Skipping sample id=682410. Maximum sequence length: 2049, sample length: 2064 +[default0]:Skipping sample id=1411046. Maximum sequence length: 2049, sample length: 3097 +[default0]:Skipping sample id=1296117. Maximum sequence length: 2049, sample length: 2798 +[default0]:Skipping sample id=648760. Maximum sequence length: 2049, sample length: 2595 +[default0]:Skipping sample id=1034839. Maximum sequence length: 2049, sample length: 2894 +[default0]:Skipping sample id=1380034. Maximum sequence length: 2049, sample length: 3587 +[default0]:Skipping sample id=321873. Maximum sequence length: 2049, sample length: 2761 +[default0]:Skipping sample id=1284708. Maximum sequence length: 2049, sample length: 2832 +[default0]:Skipping sample id=552221. Maximum sequence length: 2049, sample length: 2178 +[default0]:Skipping sample id=617534. Maximum sequence length: 2049, sample length: 2073 +[default0]:Skipping sample id=1035620. Maximum sequence length: 2049, sample length: 3202 +[default0]:Skipping sample id=880987. Maximum sequence length: 2049, sample length: 2102 +[default0]:Skipping sample id=991854. Maximum sequence length: 2049, sample length: 2212 +[default0]:Skipping sample id=501981. Maximum sequence length: 2049, sample length: 2588 +[default0]:Skipping sample id=197412. Maximum sequence length: 2049, sample length: 2111 +[default0]:Skipping sample id=311635. Maximum sequence length: 2049, sample length: 2369 +[default0]:Skipping sample id=170453. Maximum sequence length: 2049, sample length: 2473 +[default0]:Skipping sample id=839632. Maximum sequence length: 2049, sample length: 2822 +[default0]:Skipping sample id=579837. Maximum sequence length: 2049, sample length: 3366 +[default0]:Skipping sample id=214814. Maximum sequence length: 2049, sample length: 4181 +[default0]:Skipping sample id=549572. Maximum sequence length: 2049, sample length: 2071 +[default0]:Skipping sample id=104360. Maximum sequence length: 2049, sample length: 3748 +[default0]:Skipping sample id=126688. Maximum sequence length: 2049, sample length: 3774 +[default0]:Skipping sample id=332074. Maximum sequence length: 2049, sample length: 2447 +[default0]:Skipping sample id=1533396. Maximum sequence length: 2049, sample length: 3063 +[default0]:Skipping sample id=1459048. Maximum sequence length: 2049, sample length: 2471 +[default0]:Skipping sample id=199571. Maximum sequence length: 2049, sample length: 3188 +[default0]:Skipping sample id=102179. Maximum sequence length: 2049, sample length: 3307 +[default0]:Skipping sample id=173153. Maximum sequence length: 2049, sample length: 2528 +[default0]:Skipping sample id=1357764. Maximum sequence length: 2049, sample length: 2380 +[default0]:Skipping sample id=897830. Maximum sequence length: 2049, sample length: 4550 +[default0]:Skipping sample id=1543136. Maximum sequence length: 2049, sample length: 4177 +[default0]:Skipping sample id=1339095. Maximum sequence length: 2049, sample length: 2747 +[default0]:Skipping sample id=330036. Maximum sequence length: 2049, sample length: 5463 +[default0]:Skipping sample id=843954. Maximum sequence length: 2049, sample length: 3505 +[default0]:Skipping sample id=1121531. Maximum sequence length: 2049, sample length: 3869 +[default0]:Skipping sample id=862182. Maximum sequence length: 2049, sample length: 2959 +[default0]:Skipping sample id=1349261. Maximum sequence length: 2049, sample length: 2307 +[default0]:Skipping sample id=761922. Maximum sequence length: 2049, sample length: 2113 +[default0]:Skipping sample id=1155342. Maximum sequence length: 2049, sample length: 4333 +[default0]:Skipping sample id=295292. Maximum sequence length: 2049, sample length: 3835 +[default0]:Skipping sample id=82523. Maximum sequence length: 2049, sample length: 2736 +[default0]:Skipping sample id=1486384. Maximum sequence length: 2049, sample length: 2756 +[default0]:Skipping sample id=143189. Maximum sequence length: 2049, sample length: 2121 +[default0]:Skipping sample id=1390857. Maximum sequence length: 2049, sample length: 2410 +[default0]:Skipping sample id=1540837. Maximum sequence length: 2049, sample length: 2218 +[default0]:Skipping sample id=696664. Maximum sequence length: 2049, sample length: 3664 +[default0]:Skipping sample id=43378. Maximum sequence length: 2049, sample length: 2472 +[default0]:Skipping sample id=7819. Maximum sequence length: 2049, sample length: 3381 +[default0]:Skipping sample id=1182404. Maximum sequence length: 2049, sample length: 3956 +[default0]:Skipping sample id=450727. Maximum sequence length: 2049, sample length: 4360 +[default0]:Skipping sample id=538243. Maximum sequence length: 2049, sample length: 5544 +[default0]:Skipping sample id=716764. Maximum sequence length: 2049, sample length: 5350 +[default0]:Skipping sample id=533767. Maximum sequence length: 2049, sample length: 2132 +[default0]:Skipping sample id=925900. Maximum sequence length: 2049, sample length: 2244 +[default0]:Skipping sample id=1250745. Maximum sequence length: 2049, sample length: 2203 +[default0]:Skipping sample id=449296. Maximum sequence length: 2049, sample length: 5234 +[default0]:Skipping sample id=3121. Maximum sequence length: 2049, sample length: 4802 +[default0]:Skipping sample id=800571. Maximum sequence length: 2049, sample length: 3658 +[default0]:Skipping sample id=1310916. Maximum sequence length: 2049, sample length: 2881 +[default0]:Skipping sample id=245812. Maximum sequence length: 2049, sample length: 2222 +[default0]:Skipping sample id=1152783. Maximum sequence length: 2049, sample length: 2064 +[default0]:Skipping sample id=43471. Maximum sequence length: 2049, sample length: 2140 +[default0]:Skipping sample id=136551. Maximum sequence length: 2049, sample length: 2418 +[default0]:Skipping sample id=1199117. Maximum sequence length: 2049, sample length: 2418 +[default0]:Skipping sample id=1071259. Maximum sequence length: 2049, sample length: 5125 +[default0]:Skipping sample id=1077159. Maximum sequence length: 2049, sample length: 4346 +[default0]:Skipping sample id=846194. Maximum sequence length: 2049, sample length: 3724 +[default0]:Skipping sample id=748551. Maximum sequence length: 2049, sample length: 4460 +[default0]:Skipping sample id=860148. Maximum sequence length: 2049, sample length: 2231 +[default0]:Skipping sample id=705748. Maximum sequence length: 2049, sample length: 2285 +[default0]:Skipping sample id=1210073. Maximum sequence length: 2049, sample length: 2941 +[default0]:Skipping sample id=1111200. Maximum sequence length: 2049, sample length: 2404 +[default0]:Skipping sample id=887331. Maximum sequence length: 2049, sample length: 2166 +[default0]:Skipping sample id=826903. Maximum sequence length: 2049, sample length: 2441 +[default0]:Skipping sample id=210667. Maximum sequence length: 2049, sample length: 3354 +[default0]:Skipping sample id=915577. Maximum sequence length: 2049, sample length: 2340 +[default0]:Skipping sample id=140762. Maximum sequence length: 2049, sample length: 3517 +[default0]:Skipping sample id=322356. Maximum sequence length: 2049, sample length: 3323 +[default0]:Skipping sample id=625301. Maximum sequence length: 2049, sample length: 3466 +[default0]:Skipping sample id=435447. Maximum sequence length: 2049, sample length: 2697 +[default0]:Skipping sample id=1412736. Maximum sequence length: 2049, sample length: 2831 +[default0]:Skipping sample id=1426311. Maximum sequence length: 2049, sample length: 3801 +[default0]:Skipping sample id=838402. Maximum sequence length: 2049, sample length: 2223 +[default0]:Skipping sample id=53766. Maximum sequence length: 2049, sample length: 3077 +[default0]:Skipping sample id=553050. Maximum sequence length: 2049, sample length: 2596 +[default0]:Skipping sample id=50474. Maximum sequence length: 2049, sample length: 2299 +[default0]:Skipping sample id=903410. Maximum sequence length: 2049, sample length: 2175 +[default0]:Skipping sample id=433703. Maximum sequence length: 2049, sample length: 2626 +[default0]:Skipping sample id=1537593. Maximum sequence length: 2049, sample length: 2133 +[default0]:Skipping sample id=1532905. Maximum sequence length: 2049, sample length: 2214 +[default0]:Skipping sample id=414732. Maximum sequence length: 2049, sample length: 2717 +[default0]:Skipping sample id=939477. Maximum sequence length: 2049, sample length: 2447 +[default0]:Skipping sample id=390596. Maximum sequence length: 2049, sample length: 3417 +[default0]:Skipping sample id=416078. Maximum sequence length: 2049, sample length: 5231 +[default0]:Skipping sample id=705524. Maximum sequence length: 2049, sample length: 2198 +[default0]:Skipping sample id=49138. Maximum sequence length: 2049, sample length: 2354 +[default0]:Skipping sample id=1185846. Maximum sequence length: 2049, sample length: 2085 +[default0]:Skipping sample id=929874. Maximum sequence length: 2049, sample length: 2265 +[default0]:Skipping sample id=224970. Maximum sequence length: 2049, sample length: 2352 +[default0]:Skipping sample id=63320. Maximum sequence length: 2049, sample length: 3588 +[default0]:Skipping sample id=1118491. Maximum sequence length: 2049, sample length: 4509 +[default0]:Skipping sample id=577507. Maximum sequence length: 2049, sample length: 4737 +[default0]:Skipping sample id=627656. Maximum sequence length: 2049, sample length: 2616 +[default0]:Skipping sample id=1032461. Maximum sequence length: 2049, sample length: 3242 +[default0]:Skipping sample id=420704. Maximum sequence length: 2049, sample length: 2376 +[default0]:Skipping sample id=29916. Maximum sequence length: 2049, sample length: 2946 +[default0]:Skipping sample id=345936. Maximum sequence length: 2049, sample length: 2149 +[default0]:Skipping sample id=673664. Maximum sequence length: 2049, sample length: 2274 +[default0]:Skipping sample id=1091949. Maximum sequence length: 2049, sample length: 2449 +[default0]:Skipping sample id=599595. Maximum sequence length: 2049, sample length: 2580 +[default0]:Skipping sample id=95560. Maximum sequence length: 2049, sample length: 2811 +[default0]:Skipping sample id=1517793. Maximum sequence length: 2049, sample length: 5242 +[default0]:Skipping sample id=104159. Maximum sequence length: 2049, sample length: 4181 +[default0]:Skipping sample id=678395. Maximum sequence length: 2049, sample length: 2838 +[default0]:Skipping sample id=1280048. Maximum sequence length: 2049, sample length: 2434 +[default0]:Skipping sample id=500084. Maximum sequence length: 2049, sample length: 2246 +[default0]:Skipping sample id=649507. Maximum sequence length: 2049, sample length: 3337 +[default0]:Skipping sample id=317697. Maximum sequence length: 2049, sample length: 2134 +[default0]:Skipping sample id=1165078. Maximum sequence length: 2049, sample length: 4611 +[default0]:Skipping sample id=323622. Maximum sequence length: 2049, sample length: 2357 +[default0]:Skipping sample id=164979. Maximum sequence length: 2049, sample length: 4683 +[default0]:Skipping sample id=698010. Maximum sequence length: 2049, sample length: 2557 +[default0]:Skipping sample id=1026262. Maximum sequence length: 2049, sample length: 2057 +[default0]:Skipping sample id=868361. Maximum sequence length: 2049, sample length: 2060 +[default0]:Skipping sample id=1525154. Maximum sequence length: 2049, sample length: 3472 +[default0]:Skipping sample id=6169. Maximum sequence length: 2049, sample length: 2583 +[default0]:Skipping sample id=963768. Maximum sequence length: 2049, sample length: 3558 +[default0]:Skipping sample id=975442. Maximum sequence length: 2049, sample length: 2742 +[default0]:Skipping sample id=1436330. Maximum sequence length: 2049, sample length: 5127 +[default0]:Skipping sample id=1399555. Maximum sequence length: 2049, sample length: 2180 +[default0]:Skipping sample id=918347. Maximum sequence length: 2049, sample length: 4245 +[default0]:Skipping sample id=962386. Maximum sequence length: 2049, sample length: 3120 +[default0]:Skipping sample id=883798. Maximum sequence length: 2049, sample length: 3916 +[default0]:Skipping sample id=1232988. Maximum sequence length: 2049, sample length: 2789 +[default0]:Skipping sample id=290156. Maximum sequence length: 2049, sample length: 2541 +[default0]:Skipping sample id=488862. Maximum sequence length: 2049, sample length: 5181 +[default0]:Skipping sample id=573772. Maximum sequence length: 2049, sample length: 2208 +[default0]:Skipping sample id=1539459. Maximum sequence length: 2049, sample length: 2592 +[default0]:Skipping sample id=444902. Maximum sequence length: 2049, sample length: 2186 +[default0]:Skipping sample id=1092993. Maximum sequence length: 2049, sample length: 4003 +[default0]:Skipping sample id=683623. Maximum sequence length: 2049, sample length: 2176 +[default0]:Skipping sample id=711910. Maximum sequence length: 2049, sample length: 3198 +[default0]:Skipping sample id=150224. Maximum sequence length: 2049, sample length: 2425 +[default0]:Skipping sample id=460704. Maximum sequence length: 2049, sample length: 3297 +[default0]:Skipping sample id=1554367. Maximum sequence length: 2049, sample length: 2071 +[default0]:Skipping sample id=950857. Maximum sequence length: 2049, sample length: 2402 +[default0]:Skipping sample id=11766. Maximum sequence length: 2049, sample length: 3142 +[default0]:Skipping sample id=132458. Maximum sequence length: 2049, sample length: 2873 +[default0]:Skipping sample id=383186. Maximum sequence length: 2049, sample length: 2212 +[default0]:Skipping sample id=824196. Maximum sequence length: 2049, sample length: 2782 +[default0]:Skipping sample id=166679. Maximum sequence length: 2049, sample length: 2111 +[default0]:Skipping sample id=311898. Maximum sequence length: 2049, sample length: 2105 +[default0]:Skipping sample id=306116. Maximum sequence length: 2049, sample length: 2163 +[default0]:Skipping sample id=1238569. Maximum sequence length: 2049, sample length: 4245 +[default0]:Skipping sample id=1234800. Maximum sequence length: 2049, sample length: 2859 +[default0]:Skipping sample id=284568. Maximum sequence length: 2049, sample length: 2091 +[default0]:Skipping sample id=100480. Maximum sequence length: 2049, sample length: 2290 +[default0]:Skipping sample id=559754. Maximum sequence length: 2049, sample length: 3172 +[default0]:Skipping sample id=1193106. Maximum sequence length: 2049, sample length: 2269 +[default0]:Skipping sample id=618860. Maximum sequence length: 2049, sample length: 3579 +[default0]:Skipping sample id=1542496. Maximum sequence length: 2049, sample length: 3985 +[default0]:Skipping sample id=377302. Maximum sequence length: 2049, sample length: 3466 +[default0]:Skipping sample id=290939. Maximum sequence length: 2049, sample length: 3455 +[default0]:Skipping sample id=553029. Maximum sequence length: 2049, sample length: 3069 +[default0]:Skipping sample id=527279. Maximum sequence length: 2049, sample length: 2650 +[default0]:Skipping sample id=1503476. Maximum sequence length: 2049, sample length: 2468 +[default0]:Skipping sample id=1086981. Maximum sequence length: 2049, sample length: 6550 +[default0]:Skipping sample id=1042217. Maximum sequence length: 2049, sample length: 4139 +[default0]:Skipping sample id=1122962. Maximum sequence length: 2049, sample length: 4440 +[default0]:Skipping sample id=1163918. Maximum sequence length: 2049, sample length: 2229 +[default0]:Skipping sample id=1562612. Maximum sequence length: 2049, sample length: 2806 +[default0]:Skipping sample id=352610. Maximum sequence length: 2049, sample length: 2661 +[default0]:Skipping sample id=420554. Maximum sequence length: 2049, sample length: 2621 +[default0]:Skipping sample id=1370130. Maximum sequence length: 2049, sample length: 4060 +[default0]:Skipping sample id=1507329. Maximum sequence length: 2049, sample length: 2638 +[default0]:Skipping sample id=240299. Maximum sequence length: 2049, sample length: 2094 +[default0]:Skipping sample id=1428140. Maximum sequence length: 2049, sample length: 2435 +[default0]:Skipping sample id=390940. Maximum sequence length: 2049, sample length: 2126 +[default0]:Skipping sample id=833462. Maximum sequence length: 2049, sample length: 2100 +[default0]:Skipping sample id=1060307. Maximum sequence length: 2049, sample length: 3250 +[default0]:Skipping sample id=109156. Maximum sequence length: 2049, sample length: 2576 +[default0]:Skipping sample id=892936. Maximum sequence length: 2049, sample length: 2956 +[default0]:Skipping sample id=730765. Maximum sequence length: 2049, sample length: 2428 +[default0]:Skipping sample id=1544570. Maximum sequence length: 2049, sample length: 2141 +[default0]:Skipping sample id=877296. Maximum sequence length: 2049, sample length: 2066 +[default0]:Skipping sample id=1540605. Maximum sequence length: 2049, sample length: 3588 +[default0]:Skipping sample id=312911. Maximum sequence length: 2049, sample length: 2338 +[default0]:Skipping sample id=1094176. Maximum sequence length: 2049, sample length: 4659 +[default0]:Skipping sample id=1081235. Maximum sequence length: 2049, sample length: 2171 +[default0]:Skipping sample id=910849. Maximum sequence length: 2049, sample length: 2195 +[default0]:Skipping sample id=687928. Maximum sequence length: 2049, sample length: 2100 +[default0]:Skipping sample id=141193. Maximum sequence length: 2049, sample length: 2250 +[default0]:Skipping sample id=745912. Maximum sequence length: 2049, sample length: 2468 +[default0]:Skipping sample id=475960. Maximum sequence length: 2049, sample length: 2805 +[default0]:Skipping sample id=589746. Maximum sequence length: 2049, sample length: 3374 +[default0]:Skipping sample id=905251. Maximum sequence length: 2049, sample length: 3179 +[default0]:Skipping sample id=397280. Maximum sequence length: 2049, sample length: 2589 +[default0]:Skipping sample id=1070315. Maximum sequence length: 2049, sample length: 3635 +[default0]:Skipping sample id=973350. Maximum sequence length: 2049, sample length: 2751 +[default0]:Skipping sample id=1467213. Maximum sequence length: 2049, sample length: 2517 +[default0]:Skipping sample id=1367174. Maximum sequence length: 2049, sample length: 2442 +[default0]:Skipping sample id=1080949. Maximum sequence length: 2049, sample length: 2458 +[default0]:Skipping sample id=726531. Maximum sequence length: 2049, sample length: 3455 +[default0]:Skipping sample id=283032. Maximum sequence length: 2049, sample length: 2592 +[default0]:Skipping sample id=643758. Maximum sequence length: 2049, sample length: 3278 +[default0]:Skipping sample id=1308874. Maximum sequence length: 2049, sample length: 3554 +[default0]:Skipping sample id=17868. Maximum sequence length: 2049, sample length: 2257 +[default0]:Skipping sample id=1508232. Maximum sequence length: 2049, sample length: 2514 +[default0]:Skipping sample id=688650. Maximum sequence length: 2049, sample length: 2178 +[default0]:Skipping sample id=1108659. Maximum sequence length: 2049, sample length: 3508 +[default0]:Skipping sample id=50068. Maximum sequence length: 2049, sample length: 3020 +[default0]:Skipping sample id=885152. Maximum sequence length: 2049, sample length: 3297 +[default0]:Skipping sample id=254440. Maximum sequence length: 2049, sample length: 2309 +[default0]:Skipping sample id=33565. Maximum sequence length: 2049, sample length: 3031 +[default0]:Skipping sample id=1143787. Maximum sequence length: 2049, sample length: 2212 +[default0]:Skipping sample id=1377751. Maximum sequence length: 2049, sample length: 2257 +[default0]:Skipping sample id=5120. Maximum sequence length: 2049, sample length: 2054 +[default0]:Skipping sample id=1051862. Maximum sequence length: 2049, sample length: 2835 +[default0]:Skipping sample id=1135687. Maximum sequence length: 2049, sample length: 2911 +[default0]:Skipping sample id=204894. Maximum sequence length: 2049, sample length: 3345 +[default0]:Skipping sample id=714637. Maximum sequence length: 2049, sample length: 2186 +[default0]:Skipping sample id=428104. Maximum sequence length: 2049, sample length: 2469 +[default0]:Skipping sample id=480149. Maximum sequence length: 2049, sample length: 2523 +[default0]:Skipping sample id=246802. Maximum sequence length: 2049, sample length: 4250 +[default0]:Skipping sample id=1394418. Maximum sequence length: 2049, sample length: 3467 +[default0]:Skipping sample id=337574. Maximum sequence length: 2049, sample length: 2770 +[default0]:Skipping sample id=263241. Maximum sequence length: 2049, sample length: 2240 +[default0]:Skipping sample id=1356456. Maximum sequence length: 2049, sample length: 2092 +[default0]:Skipping sample id=122631. Maximum sequence length: 2049, sample length: 5832 +[default0]:Skipping sample id=610183. Maximum sequence length: 2049, sample length: 3609 +[default0]:Skipping sample id=823783. Maximum sequence length: 2049, sample length: 2376 +[default0]:Skipping sample id=1552869. Maximum sequence length: 2049, sample length: 3210 +[default0]:Skipping sample id=732542. Maximum sequence length: 2049, sample length: 3243 +[default0]:Skipping sample id=61363. Maximum sequence length: 2049, sample length: 2937 +[default0]:Skipping sample id=980100. Maximum sequence length: 2049, sample length: 4593 +[default0]:Skipping sample id=35367. Maximum sequence length: 2049, sample length: 2232 +[default0]:Skipping sample id=283734. Maximum sequence length: 2049, sample length: 2148 +[default0]:Skipping sample id=748108. Maximum sequence length: 2049, sample length: 2148 +[default0]:Skipping sample id=820732. Maximum sequence length: 2049, sample length: 2939 +[default0]:Skipping sample id=704596. Maximum sequence length: 2049, sample length: 3618 +[default0]:Skipping sample id=405348. Maximum sequence length: 2049, sample length: 3428 +[default0]:Skipping sample id=1238134. Maximum sequence length: 2049, sample length: 2740 +[default0]:Skipping sample id=1478777. Maximum sequence length: 2049, sample length: 2200 +[default0]:Skipping sample id=1341537. Maximum sequence length: 2049, sample length: 3535 +[default0]:Skipping sample id=1374093. Maximum sequence length: 2049, sample length: 2606 +[default0]:Skipping sample id=588485. Maximum sequence length: 2049, sample length: 3070 +[default0]:Skipping sample id=151295. Maximum sequence length: 2049, sample length: 2713 +[default0]:Skipping sample id=781620. Maximum sequence length: 2049, sample length: 4199 +[default0]:Skipping sample id=29491. Maximum sequence length: 2049, sample length: 2778 +[default0]:Skipping sample id=434863. Maximum sequence length: 2049, sample length: 2215 +[default0]:Skipping sample id=1131846. Maximum sequence length: 2049, sample length: 2385 +[default0]:Skipping sample id=1536366. Maximum sequence length: 2049, sample length: 2244 +[default0]:Skipping sample id=1286213. Maximum sequence length: 2049, sample length: 2577 +[default0]:Skipping sample id=159593. Maximum sequence length: 2049, sample length: 2731 +[default0]:Skipping sample id=1220440. Maximum sequence length: 2049, sample length: 2952 +[default0]:Skipping sample id=688779. Maximum sequence length: 2049, sample length: 4357 +[default0]:Skipping sample id=423631. Maximum sequence length: 2049, sample length: 2241 +[default0]:Skipping sample id=1285545. Maximum sequence length: 2049, sample length: 2074 +[default0]:Skipping sample id=166590. Maximum sequence length: 2049, sample length: 2054 +[default0]:Skipping sample id=1243976. Maximum sequence length: 2049, sample length: 2258 +[default0]:Skipping sample id=356634. Maximum sequence length: 2049, sample length: 2443 +[default0]:Skipping sample id=1212881. Maximum sequence length: 2049, sample length: 2396 +[default0]:Skipping sample id=1156834. Maximum sequence length: 2049, sample length: 3473 +[default0]:Skipping sample id=509268. Maximum sequence length: 2049, sample length: 2718 +[default0]:Skipping sample id=378874. Maximum sequence length: 2049, sample length: 2468 +[default0]:Skipping sample id=822205. Maximum sequence length: 2049, sample length: 2086 +[default0]:Skipping sample id=374773. Maximum sequence length: 2049, sample length: 4240 +[default0]:Skipping sample id=1226395. Maximum sequence length: 2049, sample length: 4063 +[default0]:Skipping sample id=512885. Maximum sequence length: 2049, sample length: 3498 +[default0]:Skipping sample id=449389. Maximum sequence length: 2049, sample length: 2428 +[default0]:Skipping sample id=1343027. Maximum sequence length: 2049, sample length: 2130 +[default0]:Skipping sample id=1555400. Maximum sequence length: 2049, sample length: 2339 +[default0]:Skipping sample id=478179. Maximum sequence length: 2049, sample length: 3007 +[default0]:Skipping sample id=919043. Maximum sequence length: 2049, sample length: 2205 +[default0]:Skipping sample id=743364. Maximum sequence length: 2049, sample length: 6917 +[default0]:Skipping sample id=1107558. Maximum sequence length: 2049, sample length: 2518 +[default0]:Skipping sample id=1466049. Maximum sequence length: 2049, sample length: 2220 +[default0]:Skipping sample id=485393. Maximum sequence length: 2049, sample length: 2341 +[default0]:Skipping sample id=204308. Maximum sequence length: 2049, sample length: 2374 +[default0]:Skipping sample id=1432605. Maximum sequence length: 2049, sample length: 3746 +[default0]:Skipping sample id=272254. Maximum sequence length: 2049, sample length: 2816 +[default0]:Skipping sample id=860193. Maximum sequence length: 2049, sample length: 2117 +[default0]:Skipping sample id=958624. Maximum sequence length: 2049, sample length: 2155 +[default0]:Skipping sample id=676056. Maximum sequence length: 2049, sample length: 2309 +[default0]:Skipping sample id=521558. Maximum sequence length: 2049, sample length: 2553 +[default0]:Skipping sample id=843839. Maximum sequence length: 2049, sample length: 2224 +[default0]:Skipping sample id=687391. Maximum sequence length: 2049, sample length: 2129 +[default0]:Skipping sample id=1568451. Maximum sequence length: 2049, sample length: 2335 +[default0]:Skipping sample id=1131240. Maximum sequence length: 2049, sample length: 4609 +[default0]:Skipping sample id=562973. Maximum sequence length: 2049, sample length: 2221 +[default0]:Skipping sample id=995729. Maximum sequence length: 2049, sample length: 2404 +[default0]:Skipping sample id=1524218. Maximum sequence length: 2049, sample length: 2063 +[default0]:Skipping sample id=1132189. Maximum sequence length: 2049, sample length: 2063 +[default0]:Skipping sample id=777925. Maximum sequence length: 2049, sample length: 2387 +[default0]:Skipping sample id=1259429. Maximum sequence length: 2049, sample length: 2225 +[default0]:Skipping sample id=193563. Maximum sequence length: 2049, sample length: 3563 +[default0]:Skipping sample id=944071. Maximum sequence length: 2049, sample length: 2550 +[default0]:Skipping sample id=247833. Maximum sequence length: 2049, sample length: 2201 +[default0]:Skipping sample id=591571. Maximum sequence length: 2049, sample length: 2306 +[default0]:Skipping sample id=1491946. Maximum sequence length: 2049, sample length: 2482 +[default0]:Skipping sample id=1494346. Maximum sequence length: 2049, sample length: 2555 +[default0]:Skipping sample id=892167. Maximum sequence length: 2049, sample length: 2705 +[default0]:Skipping sample id=1245376. Maximum sequence length: 2049, sample length: 2772 +[default0]:Skipping sample id=220626. Maximum sequence length: 2049, sample length: 3485 +[default0]:Skipping sample id=468362. Maximum sequence length: 2049, sample length: 2475 +[default0]:Skipping sample id=303340. Maximum sequence length: 2049, sample length: 3186 +[default0]:Skipping sample id=599326. Maximum sequence length: 2049, sample length: 2479 +[default0]:Skipping sample id=343346. Maximum sequence length: 2049, sample length: 4756 +[default0]:Skipping sample id=1087809. Maximum sequence length: 2049, sample length: 2167 +[default0]:Skipping sample id=419926. Maximum sequence length: 2049, sample length: 2635 +[default0]:Skipping sample id=579923. Maximum sequence length: 2049, sample length: 2569 +[default0]:Skipping sample id=1280181. Maximum sequence length: 2049, sample length: 5024 +[default0]:Skipping sample id=841297. Maximum sequence length: 2049, sample length: 3082 +[default0]:Skipping sample id=1099019. Maximum sequence length: 2049, sample length: 2134 +[default0]:Skipping sample id=834290. Maximum sequence length: 2049, sample length: 2592 +[default0]:Skipping sample id=96770. Maximum sequence length: 2049, sample length: 2766 +[default0]:Skipping sample id=892702. Maximum sequence length: 2049, sample length: 3524 +[default0]:Skipping sample id=6157. Maximum sequence length: 2049, sample length: 3051 +[default0]:Skipping sample id=545533. Maximum sequence length: 2049, sample length: 2402 +[default0]:Skipping sample id=1446765. Maximum sequence length: 2049, sample length: 4344 +[default0]:Skipping sample id=420449. Maximum sequence length: 2049, sample length: 2627 +[default0]:Skipping sample id=228169. Maximum sequence length: 2049, sample length: 3481 +[default0]:Skipping sample id=609889. Maximum sequence length: 2049, sample length: 4382 +[default0]:Skipping sample id=782134. Maximum sequence length: 2049, sample length: 2207 +[default0]:Skipping sample id=1528443. Maximum sequence length: 2049, sample length: 2644 +[default0]:Skipping sample id=696017. Maximum sequence length: 2049, sample length: 3308 +[default0]:Skipping sample id=299332. Maximum sequence length: 2049, sample length: 2170 +[default0]:Skipping sample id=1533241. Maximum sequence length: 2049, sample length: 2327 +[default0]:Skipping sample id=1424741. Maximum sequence length: 2049, sample length: 2195 +[default0]:Skipping sample id=70558. Maximum sequence length: 2049, sample length: 2251 +[default0]:Skipping sample id=1467868. Maximum sequence length: 2049, sample length: 2352 +[default0]:Skipping sample id=526864. Maximum sequence length: 2049, sample length: 2649 +[default0]:Skipping sample id=1497281. Maximum sequence length: 2049, sample length: 2126 +[default0]:Skipping sample id=1504240. Maximum sequence length: 2049, sample length: 2895 +[default0]:Skipping sample id=915642. Maximum sequence length: 2049, sample length: 3051 +[default0]:Skipping sample id=1148384. Maximum sequence length: 2049, sample length: 2058 +[default0]:Skipping sample id=1119572. Maximum sequence length: 2049, sample length: 2361 +[default0]:Skipping sample id=1180395. Maximum sequence length: 2049, sample length: 3364 +[default0]:Skipping sample id=154691. Maximum sequence length: 2049, sample length: 2056 +[default0]:Skipping sample id=325980. Maximum sequence length: 2049, sample length: 4671 +[default0]:Skipping sample id=981898. Maximum sequence length: 2049, sample length: 2198 +[default0]:Skipping sample id=880763. Maximum sequence length: 2049, sample length: 3121 +[default0]:Skipping sample id=1531801. Maximum sequence length: 2049, sample length: 2511 +[default0]:Skipping sample id=863139. Maximum sequence length: 2049, sample length: 4699 +[default0]:Skipping sample id=252981. Maximum sequence length: 2049, sample length: 3271 +[default0]:Skipping sample id=1189904. Maximum sequence length: 2049, sample length: 3549 +[default0]:Skipping sample id=770312. Maximum sequence length: 2049, sample length: 2668 +[default0]:Skipping sample id=1401373. Maximum sequence length: 2049, sample length: 2431 +[default0]:Skipping sample id=573391. Maximum sequence length: 2049, sample length: 2472 +[default0]:Skipping sample id=782498. Maximum sequence length: 2049, sample length: 4489 +[default0]:Skipping sample id=290605. Maximum sequence length: 2049, sample length: 2365 +[default0]:Skipping sample id=259329. Maximum sequence length: 2049, sample length: 3646 +[default0]:Skipping sample id=587794. Maximum sequence length: 2049, sample length: 2395 +[default0]:Skipping sample id=1529864. Maximum sequence length: 2049, sample length: 3998 +[default0]:Skipping sample id=1110507. Maximum sequence length: 2049, sample length: 2265 +[default0]:Skipping sample id=580058. Maximum sequence length: 2049, sample length: 2441 +[default0]:Skipping sample id=641696. Maximum sequence length: 2049, sample length: 2296 +[default0]:Skipping sample id=1192815. Maximum sequence length: 2049, sample length: 2791 +[default0]:Skipping sample id=259403. Maximum sequence length: 2049, sample length: 2300 +[default0]:Skipping sample id=1118149. Maximum sequence length: 2049, sample length: 3352 +[default0]:Skipping sample id=1428224. Maximum sequence length: 2049, sample length: 2471 +[default0]:Skipping sample id=138931. Maximum sequence length: 2049, sample length: 3294 +[default0]:Skipping sample id=283122. Maximum sequence length: 2049, sample length: 4350 +[default0]:Skipping sample id=1003809. Maximum sequence length: 2049, sample length: 2920 +[default0]:Skipping sample id=12229. Maximum sequence length: 2049, sample length: 2217 +[default0]:Skipping sample id=80842. Maximum sequence length: 2049, sample length: 2627 +[default0]:Skipping sample id=650059. Maximum sequence length: 2049, sample length: 3511 +[default0]:Skipping sample id=465602. Maximum sequence length: 2049, sample length: 2477 +[default0]:Skipping sample id=1551272. Maximum sequence length: 2049, sample length: 3237 +[default0]:Skipping sample id=1555479. Maximum sequence length: 2049, sample length: 3080 +[default0]:Skipping sample id=1273599. Maximum sequence length: 2049, sample length: 2765 +[default0]:Skipping sample id=626480. Maximum sequence length: 2049, sample length: 2802 +[default0]:Skipping sample id=1013657. Maximum sequence length: 2049, sample length: 2762 +[default0]:Skipping sample id=157301. Maximum sequence length: 2049, sample length: 2421 +[default0]:Skipping sample id=1491505. Maximum sequence length: 2049, sample length: 2661 +[default0]:Skipping sample id=970623. Maximum sequence length: 2049, sample length: 3269 +[default0]:Skipping sample id=35169. Maximum sequence length: 2049, sample length: 2185 +[default0]:Skipping sample id=805050. Maximum sequence length: 2049, sample length: 3059 +[default0]:Skipping sample id=1371875. Maximum sequence length: 2049, sample length: 2644 +[default0]:Skipping sample id=844017. Maximum sequence length: 2049, sample length: 2219 +[default0]:Skipping sample id=275264. Maximum sequence length: 2049, sample length: 3376 +[default0]:Skipping sample id=1112744. Maximum sequence length: 2049, sample length: 3205 +[default0]:Skipping sample id=334400. Maximum sequence length: 2049, sample length: 2587 +[default0]:Skipping sample id=458916. Maximum sequence length: 2049, sample length: 2514 +[default0]:Skipping sample id=1513132. Maximum sequence length: 2049, sample length: 2862 +[default0]:Skipping sample id=757149. Maximum sequence length: 2049, sample length: 2350 +[default0]:Skipping sample id=674888. Maximum sequence length: 2049, sample length: 2399 +[default0]:Skipping sample id=669131. Maximum sequence length: 2049, sample length: 2128 +[default0]:Skipping sample id=659594. Maximum sequence length: 2049, sample length: 2507 +[default0]:Skipping sample id=573176. Maximum sequence length: 2049, sample length: 4739 +[default0]:Skipping sample id=1543032. Maximum sequence length: 2049, sample length: 2196 +[default0]:Skipping sample id=811220. Maximum sequence length: 2049, sample length: 2300 +[default0]:Skipping sample id=652292. Maximum sequence length: 2049, sample length: 4740 +[default0]:Skipping sample id=781562. Maximum sequence length: 2049, sample length: 2141 +[default0]:Skipping sample id=1336276. Maximum sequence length: 2049, sample length: 3001 +[default0]:Skipping sample id=736691. Maximum sequence length: 2049, sample length: 2089 +[default0]:Skipping sample id=295010. Maximum sequence length: 2049, sample length: 2274 +[default0]:Skipping sample id=1063745. Maximum sequence length: 2049, sample length: 3103 +[default0]:Skipping sample id=744681. Maximum sequence length: 2049, sample length: 2216 +[default0]:Skipping sample id=894429. Maximum sequence length: 2049, sample length: 2927 +[default0]:Skipping sample id=847853. Maximum sequence length: 2049, sample length: 3508 +[default0]:Skipping sample id=217222. Maximum sequence length: 2049, sample length: 2974 +[default0]:Skipping sample id=288948. Maximum sequence length: 2049, sample length: 2290 +[default0]:Skipping sample id=1441405. Maximum sequence length: 2049, sample length: 3153 +[default0]:Skipping sample id=1257063. Maximum sequence length: 2049, sample length: 3947 +[default0]:Skipping sample id=1062501. Maximum sequence length: 2049, sample length: 2071 +[default0]:Skipping sample id=1493795. Maximum sequence length: 2049, sample length: 2962 +[default0]:Skipping sample id=378376. Maximum sequence length: 2049, sample length: 2156 +[default0]:Skipping sample id=293268. Maximum sequence length: 2049, sample length: 4035 +[default0]:Skipping sample id=609543. Maximum sequence length: 2049, sample length: 3551 +[default0]:Skipping sample id=264019. Maximum sequence length: 2049, sample length: 2108 +[default0]:Skipping sample id=701141. Maximum sequence length: 2049, sample length: 2089 +[default0]:Skipping sample id=677477. Maximum sequence length: 2049, sample length: 2665 +[default0]:Skipping sample id=1125866. Maximum sequence length: 2049, sample length: 2090 +[default0]:Skipping sample id=50997. Maximum sequence length: 2049, sample length: 2164 +[default0]:Skipping sample id=656900. Maximum sequence length: 2049, sample length: 2389 +[default0]:Skipping sample id=16918. Maximum sequence length: 2049, sample length: 3200 +[default0]:Skipping sample id=641178. Maximum sequence length: 2049, sample length: 2094 +[default0]:Skipping sample id=256887. Maximum sequence length: 2049, sample length: 2618 +[default0]:Skipping sample id=495008. Maximum sequence length: 2049, sample length: 3098 +[default0]:Skipping sample id=1286938. Maximum sequence length: 2049, sample length: 3168 +[default0]:Skipping sample id=1534772. Maximum sequence length: 2049, sample length: 2735 +[default0]:Skipping sample id=1535360. Maximum sequence length: 2049, sample length: 2214 +[default0]:Skipping sample id=1463013. Maximum sequence length: 2049, sample length: 2637 +[default0]:Skipping sample id=1236152. Maximum sequence length: 2049, sample length: 2671 +[default0]:Skipping sample id=1097876. Maximum sequence length: 2049, sample length: 3601 +[default0]:Skipping sample id=1112302. Maximum sequence length: 2049, sample length: 2166 +[default0]:Skipping sample id=476408. Maximum sequence length: 2049, sample length: 4111 +[default0]:Skipping sample id=1045713. Maximum sequence length: 2049, sample length: 2388 +[default0]:Skipping sample id=1529449. Maximum sequence length: 2049, sample length: 2307 +[default0]:Skipping sample id=1084698. Maximum sequence length: 2049, sample length: 4225 +[default0]:Skipping sample id=1435243. Maximum sequence length: 2049, sample length: 4771 +[default0]:Skipping sample id=430086. Maximum sequence length: 2049, sample length: 2415 +[default0]:Skipping sample id=959418. Maximum sequence length: 2049, sample length: 3124 +[default0]:Skipping sample id=623724. Maximum sequence length: 2049, sample length: 3114 +[default0]:Skipping sample id=1011398. Maximum sequence length: 2049, sample length: 3091 +[default0]:Skipping sample id=919409. Maximum sequence length: 2049, sample length: 2921 +[default0]:Skipping sample id=697667. Maximum sequence length: 2049, sample length: 2727 +[default0]:Skipping sample id=639. Maximum sequence length: 2049, sample length: 2582 +[default0]:Skipping sample id=827641. Maximum sequence length: 2049, sample length: 3134 +[default0]:Skipping sample id=1332449. Maximum sequence length: 2049, sample length: 3079 +[default0]:Skipping sample id=427822. Maximum sequence length: 2049, sample length: 2676 +[default0]:Skipping sample id=811740. Maximum sequence length: 2049, sample length: 2202 +[default0]:Skipping sample id=375826. Maximum sequence length: 2049, sample length: 2266 +[default0]:Skipping sample id=289937. Maximum sequence length: 2049, sample length: 2658 +[default0]:Skipping sample id=917650. Maximum sequence length: 2049, sample length: 2101 +[default0]:Skipping sample id=1279669. Maximum sequence length: 2049, sample length: 2905 +[default0]:Skipping sample id=989793. Maximum sequence length: 2049, sample length: 2054 +[default0]:Skipping sample id=711638. Maximum sequence length: 2049, sample length: 2512 +[default0]:Skipping sample id=1513224. Maximum sequence length: 2049, sample length: 2239 +[default0]:Skipping sample id=715697. Maximum sequence length: 2049, sample length: 3791 +[default0]:Skipping sample id=1007655. Maximum sequence length: 2049, sample length: 2167 +[default0]:Skipping sample id=795136. Maximum sequence length: 2049, sample length: 2871 +[default0]:Skipping sample id=165018. Maximum sequence length: 2049, sample length: 3483 +[default0]:Skipping sample id=1001109. Maximum sequence length: 2049, sample length: 2296 +[default0]:Skipping sample id=269890. Maximum sequence length: 2049, sample length: 2306 +[default0]:Skipping sample id=1006748. Maximum sequence length: 2049, sample length: 2682 +[default0]:Skipping sample id=1230347. Maximum sequence length: 2049, sample length: 2165 +[default0]:Skipping sample id=343362. Maximum sequence length: 2049, sample length: 3455 +[default0]:Skipping sample id=948379. Maximum sequence length: 2049, sample length: 4906 +[default0]:Skipping sample id=1392281. Maximum sequence length: 2049, sample length: 2322 +[default0]:Skipping sample id=1395938. Maximum sequence length: 2049, sample length: 3831 +[default0]:Skipping sample id=360583. Maximum sequence length: 2049, sample length: 2547 +[default0]:Skipping sample id=1538475. Maximum sequence length: 2049, sample length: 2060 +[default0]:Skipping sample id=345408. Maximum sequence length: 2049, sample length: 2805 +[default0]:Skipping sample id=242180. Maximum sequence length: 2049, sample length: 5395 +[default0]:Skipping sample id=123885. Maximum sequence length: 2049, sample length: 2133 +[default0]:Skipping sample id=136797. Maximum sequence length: 2049, sample length: 3559 +[default0]:Skipping sample id=1095727. Maximum sequence length: 2049, sample length: 2348 +[default0]:Skipping sample id=1023325. Maximum sequence length: 2049, sample length: 3465 +[default0]:Skipping sample id=449427. Maximum sequence length: 2049, sample length: 3337 +[default0]:Skipping sample id=468241. Maximum sequence length: 2049, sample length: 2353 +[default0]:Skipping sample id=955664. Maximum sequence length: 2049, sample length: 3262 +[default0]:Skipping sample id=560462. Maximum sequence length: 2049, sample length: 2485 +[default0]:Skipping sample id=197336. Maximum sequence length: 2049, sample length: 3158 +[default0]:Skipping sample id=730430. Maximum sequence length: 2049, sample length: 2330 +[default0]:Skipping sample id=802852. Maximum sequence length: 2049, sample length: 3280 +[default0]:Skipping sample id=1069248. Maximum sequence length: 2049, sample length: 2937 +[default0]:Skipping sample id=1386195. Maximum sequence length: 2049, sample length: 2662 +[default0]:Skipping sample id=1409263. Maximum sequence length: 2049, sample length: 2379 +[default0]:Skipping sample id=127194. Maximum sequence length: 2049, sample length: 2210 +[default0]:Skipping sample id=500308. Maximum sequence length: 2049, sample length: 3077 +[default0]:Skipping sample id=335816. Maximum sequence length: 2049, sample length: 4184 +[default0]:Skipping sample id=243854. Maximum sequence length: 2049, sample length: 2450 +[default0]:Skipping sample id=631174. Maximum sequence length: 2049, sample length: 2064 +[default0]:Skipping sample id=521592. Maximum sequence length: 2049, sample length: 3009 +[default0]:Skipping sample id=230931. Maximum sequence length: 2049, sample length: 3032 +[default0]:Skipping sample id=1094650. Maximum sequence length: 2049, sample length: 2111 +[default0]:Skipping sample id=153658. Maximum sequence length: 2049, sample length: 2599 +[default0]:Skipping sample id=510190. Maximum sequence length: 2049, sample length: 2323 +[default0]:Skipping sample id=1497368. Maximum sequence length: 2049, sample length: 5116 +[default0]:Skipping sample id=1000564. Maximum sequence length: 2049, sample length: 2769 +[default0]:Skipping sample id=1367939. Maximum sequence length: 2049, sample length: 3527 +[default0]:Skipping sample id=1379841. Maximum sequence length: 2049, sample length: 5144 +[default0]:Skipping sample id=1096028. Maximum sequence length: 2049, sample length: 2893 +[default0]:Skipping sample id=510614. Maximum sequence length: 2049, sample length: 2366 +[default0]:Skipping sample id=1386403. Maximum sequence length: 2049, sample length: 2414 +[default0]:Skipping sample id=4237. Maximum sequence length: 2049, sample length: 2961 +[default0]:Skipping sample id=1260679. Maximum sequence length: 2049, sample length: 2402 +[default0]:Skipping sample id=464157. Maximum sequence length: 2049, sample length: 2606 +[default0]:Skipping sample id=877511. Maximum sequence length: 2049, sample length: 3687 +[default0]:Skipping sample id=362949. Maximum sequence length: 2049, sample length: 4924 +[default0]:Skipping sample id=644528. Maximum sequence length: 2049, sample length: 2085 +[default0]:Skipping sample id=31732. Maximum sequence length: 2049, sample length: 3030 +[default0]:Skipping sample id=204302. Maximum sequence length: 2049, sample length: 3689 +[default0]:Skipping sample id=641929. Maximum sequence length: 2049, sample length: 2597 +[default0]:Skipping sample id=1204158. Maximum sequence length: 2049, sample length: 2808 +[default0]:Skipping sample id=1368419. Maximum sequence length: 2049, sample length: 2196 +[default0]:Skipping sample id=436370. Maximum sequence length: 2049, sample length: 2685 +[default0]:Skipping sample id=58851. Maximum sequence length: 2049, sample length: 2584 +[default0]:Skipping sample id=1571951. Maximum sequence length: 2049, sample length: 2365 +[default0]:Skipping sample id=876317. Maximum sequence length: 2049, sample length: 3753 +[default0]:Skipping sample id=795288. Maximum sequence length: 2049, sample length: 2105 +[default0]:Skipping sample id=794759. Maximum sequence length: 2049, sample length: 2564 +[default0]:Skipping sample id=1138597. Maximum sequence length: 2049, sample length: 2412 +[default0]:Skipping sample id=910703. Maximum sequence length: 2049, sample length: 5891 +[default0]:Skipping sample id=1146827. Maximum sequence length: 2049, sample length: 2108 +[default0]:Skipping sample id=63557. Maximum sequence length: 2049, sample length: 2238 +[default0]:Skipping sample id=311563. Maximum sequence length: 2049, sample length: 2149 +[default0]:Skipping sample id=1117590. Maximum sequence length: 2049, sample length: 3236 +[default0]:Skipping sample id=883966. Maximum sequence length: 2049, sample length: 3596 +[default0]:Skipping sample id=114331. Maximum sequence length: 2049, sample length: 2073 +[default0]:Skipping sample id=1143984. Maximum sequence length: 2049, sample length: 3145 +[default0]:Skipping sample id=2675. Maximum sequence length: 2049, sample length: 2969 +[default0]:Skipping sample id=397338. Maximum sequence length: 2049, sample length: 2499 +[default0]:Skipping sample id=1515545. Maximum sequence length: 2049, sample length: 3331 +[default0]:Skipping sample id=987435. Maximum sequence length: 2049, sample length: 2876 +[default0]:Skipping sample id=96692. Maximum sequence length: 2049, sample length: 2755 +[default0]:Skipping sample id=1316423. Maximum sequence length: 2049, sample length: 2191 +[default0]:Skipping sample id=258578. Maximum sequence length: 2049, sample length: 3430 +[default0]:Skipping sample id=1342829. Maximum sequence length: 2049, sample length: 3204 +[default0]:Skipping sample id=715164. Maximum sequence length: 2049, sample length: 4339 +[default0]:Skipping sample id=222927. Maximum sequence length: 2049, sample length: 2996 +[default0]:Skipping sample id=77981. Maximum sequence length: 2049, sample length: 3094 +[default0]:Skipping sample id=268171. Maximum sequence length: 2049, sample length: 5598 +[default0]:Skipping sample id=254030. Maximum sequence length: 2049, sample length: 2477 +[default0]:Skipping sample id=640038. Maximum sequence length: 2049, sample length: 2444 +[default0]:Skipping sample id=38994. Maximum sequence length: 2049, sample length: 4023 +[default0]:Skipping sample id=1122164. Maximum sequence length: 2049, sample length: 2531 +[default0]:Skipping sample id=443380. Maximum sequence length: 2049, sample length: 2913 +[default0]:Skipping sample id=865927. Maximum sequence length: 2049, sample length: 2273 +[default0]:Skipping sample id=1149906. Maximum sequence length: 2049, sample length: 2104 +[default0]:Skipping sample id=919914. Maximum sequence length: 2049, sample length: 2838 +[default0]:Skipping sample id=826253. Maximum sequence length: 2049, sample length: 4902 +[default0]:Skipping sample id=1505796. Maximum sequence length: 2049, sample length: 2154 +[default0]:Skipping sample id=1530884. Maximum sequence length: 2049, sample length: 2976 +[default0]:Skipping sample id=80143. Maximum sequence length: 2049, sample length: 2457 +[default0]:Skipping sample id=248977. Maximum sequence length: 2049, sample length: 3310 +[default0]:Skipping sample id=1144097. Maximum sequence length: 2049, sample length: 2496 +[default0]:Skipping sample id=1231265. Maximum sequence length: 2049, sample length: 2500 +[default0]:Skipping sample id=302965. Maximum sequence length: 2049, sample length: 2201 +[default0]:Skipping sample id=611220. Maximum sequence length: 2049, sample length: 2758 +[default0]:Skipping sample id=692758. Maximum sequence length: 2049, sample length: 3429 +[default0]:Skipping sample id=1003738. Maximum sequence length: 2049, sample length: 3276 +[default0]:Skipping sample id=939391. Maximum sequence length: 2049, sample length: 4001 +[default0]:Skipping sample id=1153208. Maximum sequence length: 2049, sample length: 3122 +[default0]:Skipping sample id=1252018. Maximum sequence length: 2049, sample length: 2929 +[default0]:Skipping sample id=532528. Maximum sequence length: 2049, sample length: 2530 +[default0]:Skipping sample id=490576. Maximum sequence length: 2049, sample length: 2172 +[default0]:Skipping sample id=564268. Maximum sequence length: 2049, sample length: 3493 +[default0]:Skipping sample id=690699. Maximum sequence length: 2049, sample length: 2356 +[default0]:Skipping sample id=823610. Maximum sequence length: 2049, sample length: 3594 +[default0]:Skipping sample id=641625. Maximum sequence length: 2049, sample length: 2197 +[default0]:Skipping sample id=691523. Maximum sequence length: 2049, sample length: 2181 +[default0]:Skipping sample id=855372. Maximum sequence length: 2049, sample length: 2936 +[default0]:Skipping sample id=1062520. Maximum sequence length: 2049, sample length: 2197 +[default0]:Skipping sample id=270376. Maximum sequence length: 2049, sample length: 4288 +[default0]:Skipping sample id=361446. Maximum sequence length: 2049, sample length: 3490 +[default0]:Skipping sample id=894590. Maximum sequence length: 2049, sample length: 2690 +[default0]:Skipping sample id=1275083. Maximum sequence length: 2049, sample length: 2654 +[default0]:Skipping sample id=748565. Maximum sequence length: 2049, sample length: 3334 +[default0]:Skipping sample id=1285485. Maximum sequence length: 2049, sample length: 3840 +[default0]:Skipping sample id=781280. Maximum sequence length: 2049, sample length: 2367 +[default0]:Skipping sample id=1042199. Maximum sequence length: 2049, sample length: 2101 +[default0]:Skipping sample id=50703. Maximum sequence length: 2049, sample length: 3546 +[default0]:Skipping sample id=844009. Maximum sequence length: 2049, sample length: 2343 +[default0]:Skipping sample id=863577. Maximum sequence length: 2049, sample length: 2095 +[default0]:Skipping sample id=631368. Maximum sequence length: 2049, sample length: 2253 +[default0]:Skipping sample id=692251. Maximum sequence length: 2049, sample length: 3252 +[default0]:Skipping sample id=68929. Maximum sequence length: 2049, sample length: 3246 +[default0]:Skipping sample id=1001247. Maximum sequence length: 2049, sample length: 2340 +[default0]:Skipping sample id=230345. Maximum sequence length: 2049, sample length: 3188 +[default0]:Skipping sample id=1124204. Maximum sequence length: 2049, sample length: 3481 +[default0]:Skipping sample id=1046433. Maximum sequence length: 2049, sample length: 2808 +[default0]:Skipping sample id=445790. Maximum sequence length: 2049, sample length: 2354 +[default0]:Skipping sample id=1360731. Maximum sequence length: 2049, sample length: 2436 +[default0]:Skipping sample id=657538. Maximum sequence length: 2049, sample length: 4063 +[default0]:Skipping sample id=1021158. Maximum sequence length: 2049, sample length: 5358 +[default0]:Skipping sample id=771828. Maximum sequence length: 2049, sample length: 2206 +[default0]:Skipping sample id=1389323. Maximum sequence length: 2049, sample length: 3673 +[default0]:Skipping sample id=355243. Maximum sequence length: 2049, sample length: 2750 +[default0]:Skipping sample id=1197981. Maximum sequence length: 2049, sample length: 2298 +[default0]:Skipping sample id=1175100. Maximum sequence length: 2049, sample length: 2051 +[default0]:Skipping sample id=801079. Maximum sequence length: 2049, sample length: 3191 +[default0]:Skipping sample id=788789. Maximum sequence length: 2049, sample length: 3272 +[default0]:Skipping sample id=640978. Maximum sequence length: 2049, sample length: 2092 +[default0]:Skipping sample id=743357. Maximum sequence length: 2049, sample length: 2935 +[default0]:Skipping sample id=762366. Maximum sequence length: 2049, sample length: 2444 +[default0]:Skipping sample id=458508. Maximum sequence length: 2049, sample length: 3773 +[default0]:Skipping sample id=934188. Maximum sequence length: 2049, sample length: 3446 +[default0]:Skipping sample id=495636. Maximum sequence length: 2049, sample length: 3950 +[default0]:Skipping sample id=1123655. Maximum sequence length: 2049, sample length: 2570 +[default0]:Skipping sample id=38964. Maximum sequence length: 2049, sample length: 3015 +[default0]:Skipping sample id=1073400. Maximum sequence length: 2049, sample length: 2242 +[default0]:Skipping sample id=260286. Maximum sequence length: 2049, sample length: 2594 +[default0]:Skipping sample id=277193. Maximum sequence length: 2049, sample length: 2722 +[default0]:Skipping sample id=1011030. Maximum sequence length: 2049, sample length: 2495 +[default0]:Skipping sample id=823965. Maximum sequence length: 2049, sample length: 2659 +[default0]:Skipping sample id=166826. Maximum sequence length: 2049, sample length: 2203 +[default0]:Skipping sample id=1502534. Maximum sequence length: 2049, sample length: 3566 +[default0]:Skipping sample id=847570. Maximum sequence length: 2049, sample length: 2494 +[default0]:Skipping sample id=670501. Maximum sequence length: 2049, sample length: 2579 +[default0]:Skipping sample id=534239. Maximum sequence length: 2049, sample length: 2713 +[default0]:Skipping sample id=1544413. Maximum sequence length: 2049, sample length: 4650 +[default0]:Skipping sample id=1284643. Maximum sequence length: 2049, sample length: 3417 +[default0]:Skipping sample id=83012. Maximum sequence length: 2049, sample length: 4193 +[default0]:Skipping sample id=958544. Maximum sequence length: 2049, sample length: 3654 +[default0]:Skipping sample id=429345. Maximum sequence length: 2049, sample length: 2181 +[default0]:Skipping sample id=1560921. Maximum sequence length: 2049, sample length: 4216 +[default0]:Skipping sample id=991393. Maximum sequence length: 2049, sample length: 2179 +[default0]:Skipping sample id=1544003. Maximum sequence length: 2049, sample length: 2947 +[default0]:Skipping sample id=717043. Maximum sequence length: 2049, sample length: 2959 +[default0]:Skipping sample id=540230. Maximum sequence length: 2049, sample length: 3283 +[default0]:Skipping sample id=1421610. Maximum sequence length: 2049, sample length: 3664 +[default0]:Skipping sample id=851462. Maximum sequence length: 2049, sample length: 2128 +[default0]:Skipping sample id=739327. Maximum sequence length: 2049, sample length: 2856 +[default0]:Skipping sample id=254921. Maximum sequence length: 2049, sample length: 2983 +[default0]:Skipping sample id=220657. Maximum sequence length: 2049, sample length: 2306 +[default0]:Skipping sample id=1128008. Maximum sequence length: 2049, sample length: 2158 +[default0]:Skipping sample id=726458. Maximum sequence length: 2049, sample length: 3007 +[default0]:Skipping sample id=1529302. Maximum sequence length: 2049, sample length: 3351 +[default0]:Skipping sample id=594277. Maximum sequence length: 2049, sample length: 2052 +[default0]:Skipping sample id=1132127. Maximum sequence length: 2049, sample length: 2090 +[default0]:Skipping sample id=632602. Maximum sequence length: 2049, sample length: 2802 +[default0]:Skipping sample id=709356. Maximum sequence length: 2049, sample length: 2914 +[default0]:Skipping sample id=271228. Maximum sequence length: 2049, sample length: 2637 +[default0]:Skipping sample id=1459366. Maximum sequence length: 2049, sample length: 4053 +[default0]:Skipping sample id=508456. Maximum sequence length: 2049, sample length: 2239 +[default0]:Skipping sample id=724178. Maximum sequence length: 2049, sample length: 3934 +[default0]:Skipping sample id=55974. Maximum sequence length: 2049, sample length: 3764 +[default0]:Skipping sample id=1373442. Maximum sequence length: 2049, sample length: 2891 +[default0]:Skipping sample id=513163. Maximum sequence length: 2049, sample length: 3518 +[default0]:Skipping sample id=293217. Maximum sequence length: 2049, sample length: 2348 +[default0]:Skipping sample id=268149. Maximum sequence length: 2049, sample length: 2079 +[default0]:Skipping sample id=1136847. Maximum sequence length: 2049, sample length: 4616 +[default0]:Skipping sample id=239199. Maximum sequence length: 2049, sample length: 2313 +[default0]:Skipping sample id=1111544. Maximum sequence length: 2049, sample length: 2194 +[default0]:Skipping sample id=60761. Maximum sequence length: 2049, sample length: 3147 +[default0]:Skipping sample id=559390. Maximum sequence length: 2049, sample length: 3206 +[default0]:Skipping sample id=1240114. Maximum sequence length: 2049, sample length: 2656 +[default0]:Skipping sample id=372754. Maximum sequence length: 2049, sample length: 2170 +[default0]:Skipping sample id=464202. Maximum sequence length: 2049, sample length: 2786 +[default0]:Skipping sample id=122870. Maximum sequence length: 2049, sample length: 2424 +[default0]:Skipping sample id=64808. Maximum sequence length: 2049, sample length: 2154 +[default0]:Skipping sample id=1526576. Maximum sequence length: 2049, sample length: 3972 +[default0]:Skipping sample id=173442. Maximum sequence length: 2049, sample length: 2414 +[default0]:Skipping sample id=148309. Maximum sequence length: 2049, sample length: 3441 +[default0]:Skipping sample id=230743. Maximum sequence length: 2049, sample length: 2588 +[default0]:Skipping sample id=1071249. Maximum sequence length: 2049, sample length: 2051 +[default0]:Skipping sample id=252852. Maximum sequence length: 2049, sample length: 4405 +[default0]:Skipping sample id=287407. Maximum sequence length: 2049, sample length: 3974 +[default0]:Skipping sample id=500863. Maximum sequence length: 2049, sample length: 2558 +[default0]:Skipping sample id=12534. Maximum sequence length: 2049, sample length: 2704 +[default0]:Skipping sample id=1329306. Maximum sequence length: 2049, sample length: 2601 +[default0]:Skipping sample id=630036. Maximum sequence length: 2049, sample length: 4194 +[default0]:Skipping sample id=1163024. Maximum sequence length: 2049, sample length: 5018 +[default0]:Skipping sample id=997741. Maximum sequence length: 2049, sample length: 4295 +[default0]:Skipping sample id=281509. Maximum sequence length: 2049, sample length: 2798 +[default0]:Skipping sample id=576589. Maximum sequence length: 2049, sample length: 2165 +[default0]:Skipping sample id=1045370. Maximum sequence length: 2049, sample length: 2951 +[default0]:Skipping sample id=213600. Maximum sequence length: 2049, sample length: 3091 +[default0]:Skipping sample id=676162. Maximum sequence length: 2049, sample length: 3320 +[default0]:Skipping sample id=1193065. Maximum sequence length: 2049, sample length: 2287 +[default0]:Skipping sample id=1510045. Maximum sequence length: 2049, sample length: 3761 +[default0]:Skipping sample id=155907. Maximum sequence length: 2049, sample length: 2775 +[default0]:Skipping sample id=407255. Maximum sequence length: 2049, sample length: 2219 +[default0]:Skipping sample id=172563. Maximum sequence length: 2049, sample length: 3555 +[default0]:Skipping sample id=1020227. Maximum sequence length: 2049, sample length: 2329 +[default0]:Skipping sample id=306577. Maximum sequence length: 2049, sample length: 2742 +[default0]:Skipping sample id=627663. Maximum sequence length: 2049, sample length: 2833 +[default0]:Skipping sample id=1093291. Maximum sequence length: 2049, sample length: 3209 +[default0]:Skipping sample id=581803. Maximum sequence length: 2049, sample length: 2995 +[default0]:Skipping sample id=390434. Maximum sequence length: 2049, sample length: 2180 +[default0]:Skipping sample id=65367. Maximum sequence length: 2049, sample length: 3106 +[default0]:Skipping sample id=1450663. Maximum sequence length: 2049, sample length: 2053 +[default0]:Skipping sample id=869994. Maximum sequence length: 2049, sample length: 3336 +[default0]:Skipping sample id=622433. Maximum sequence length: 2049, sample length: 2592 +[default0]:Skipping sample id=974579. Maximum sequence length: 2049, sample length: 2314 +[default0]:Skipping sample id=198696. Maximum sequence length: 2049, sample length: 2292 +[default0]:Skipping sample id=1164819. Maximum sequence length: 2049, sample length: 4740 +[default0]:Skipping sample id=855968. Maximum sequence length: 2049, sample length: 3327 +[default0]:Skipping sample id=1502026. Maximum sequence length: 2049, sample length: 2313 +[default0]:Skipping sample id=1419838. Maximum sequence length: 2049, sample length: 2384 +[default0]:Skipping sample id=493214. Maximum sequence length: 2049, sample length: 4063 +[default0]:Skipping sample id=265368. Maximum sequence length: 2049, sample length: 5667 +[default0]:Skipping sample id=1458931. Maximum sequence length: 2049, sample length: 4640 +[default0]:Skipping sample id=1223067. Maximum sequence length: 2049, sample length: 7107 +[default0]:Skipping sample id=872880. Maximum sequence length: 2049, sample length: 4379 +[default0]:Skipping sample id=40621. Maximum sequence length: 2049, sample length: 3079 +[default0]:Skipping sample id=644385. Maximum sequence length: 2049, sample length: 2292 +[default0]:Skipping sample id=707764. Maximum sequence length: 2049, sample length: 2084 +[default0]:Skipping sample id=1286331. Maximum sequence length: 2049, sample length: 5119 +[default0]:Skipping sample id=723771. Maximum sequence length: 2049, sample length: 2824 +[default0]:Skipping sample id=868274. Maximum sequence length: 2049, sample length: 2731 +[default0]:Skipping sample id=173961. Maximum sequence length: 2049, sample length: 3214 +[default0]:Skipping sample id=212176. Maximum sequence length: 2049, sample length: 2311 +[default0]:Skipping sample id=787459. Maximum sequence length: 2049, sample length: 2359 +[default0]:Skipping sample id=850246. Maximum sequence length: 2049, sample length: 3675 +[default0]:Skipping sample id=676040. Maximum sequence length: 2049, sample length: 2655 +[default0]:Skipping sample id=737427. Maximum sequence length: 2049, sample length: 2778 +[default0]:Skipping sample id=366656. Maximum sequence length: 2049, sample length: 2887 +[default0]:Skipping sample id=1018604. Maximum sequence length: 2049, sample length: 3458 +[default0]:Skipping sample id=593002. Maximum sequence length: 2049, sample length: 2260 +[default0]:Skipping sample id=684779. Maximum sequence length: 2049, sample length: 2572 +[default0]:Skipping sample id=1499904. Maximum sequence length: 2049, sample length: 2852 +[default0]:Skipping sample id=290056. Maximum sequence length: 2049, sample length: 2101 +[default0]:Skipping sample id=1325412. Maximum sequence length: 2049, sample length: 2333 +[default0]:Skipping sample id=628983. Maximum sequence length: 2049, sample length: 3459 +[default0]:Skipping sample id=798463. Maximum sequence length: 2049, sample length: 3443 +[default0]:Skipping sample id=231000. Maximum sequence length: 2049, sample length: 3577 +[default0]:Skipping sample id=1119155. Maximum sequence length: 2049, sample length: 2381 +[default0]:Skipping sample id=1046839. Maximum sequence length: 2049, sample length: 2459 +[default0]:Skipping sample id=945507. Maximum sequence length: 2049, sample length: 2181 +[default0]:Skipping sample id=830650. Maximum sequence length: 2049, sample length: 3324 +[default0]:Skipping sample id=868107. Maximum sequence length: 2049, sample length: 4155 +[default0]:Skipping sample id=1101671. Maximum sequence length: 2049, sample length: 2628 +[default0]:Skipping sample id=750683. Maximum sequence length: 2049, sample length: 2444 +[default0]:Skipping sample id=208656. Maximum sequence length: 2049, sample length: 2173 +[default0]:Skipping sample id=302118. Maximum sequence length: 2049, sample length: 2374 +[default0]:Skipping sample id=470367. Maximum sequence length: 2049, sample length: 2067 +[default0]:Skipping sample id=712971. Maximum sequence length: 2049, sample length: 2300 +[default0]:Skipping sample id=293363. Maximum sequence length: 2049, sample length: 2708 +[default0]:Skipping sample id=760378. Maximum sequence length: 2049, sample length: 2720 +[default0]:Skipping sample id=229670. Maximum sequence length: 2049, sample length: 3975 +[default0]:Skipping sample id=772742. Maximum sequence length: 2049, sample length: 3155 +[default0]:Skipping sample id=731579. Maximum sequence length: 2049, sample length: 2963 +[default0]:Skipping sample id=1231427. Maximum sequence length: 2049, sample length: 2217 +[default0]:Skipping sample id=655861. Maximum sequence length: 2049, sample length: 3141 +[default0]:Skipping sample id=572559. Maximum sequence length: 2049, sample length: 3636 +[default0]:Skipping sample id=661696. Maximum sequence length: 2049, sample length: 6947 +[default0]:Skipping sample id=552892. Maximum sequence length: 2049, sample length: 2607 +[default0]:Skipping sample id=415548. Maximum sequence length: 2049, sample length: 2168 +[default0]:Skipping sample id=1267454. Maximum sequence length: 2049, sample length: 2593 +[default0]:Skipping sample id=147912. Maximum sequence length: 2049, sample length: 2867 +[default0]:Skipping sample id=1001348. Maximum sequence length: 2049, sample length: 5733 +[default0]:Skipping sample id=1239745. Maximum sequence length: 2049, sample length: 2785 +[default0]:Skipping sample id=683632. Maximum sequence length: 2049, sample length: 2855 +[default0]:Skipping sample id=351397. Maximum sequence length: 2049, sample length: 2202 +[default0]:Skipping sample id=1517521. Maximum sequence length: 2049, sample length: 5260 +[default0]:Skipping sample id=783563. Maximum sequence length: 2049, sample length: 3084 +[default0]:Skipping sample id=1053499. Maximum sequence length: 2049, sample length: 2835 +[default0]:Skipping sample id=1011927. Maximum sequence length: 2049, sample length: 2138 +[default0]:Skipping sample id=998736. Maximum sequence length: 2049, sample length: 3116 +[default0]:Skipping sample id=732099. Maximum sequence length: 2049, sample length: 2397 +[default0]:Skipping sample id=337410. Maximum sequence length: 2049, sample length: 2118 +[default0]:Skipping sample id=1196552. Maximum sequence length: 2049, sample length: 2311 +[default0]:Skipping sample id=209050. Maximum sequence length: 2049, sample length: 3003 +[default0]:Skipping sample id=1067289. Maximum sequence length: 2049, sample length: 2714 +[default0]:Skipping sample id=852390. Maximum sequence length: 2049, sample length: 2304 +[default0]:Skipping sample id=388965. Maximum sequence length: 2049, sample length: 2321 +[default0]:Skipping sample id=1149211. Maximum sequence length: 2049, sample length: 2070 +[default0]:Skipping sample id=947967. Maximum sequence length: 2049, sample length: 3450 +[default0]:Skipping sample id=138365. Maximum sequence length: 2049, sample length: 2118 +[default0]:Skipping sample id=176591. Maximum sequence length: 2049, sample length: 2488 +[default0]:Skipping sample id=631055. Maximum sequence length: 2049, sample length: 2773 +[default0]:Skipping sample id=557640. Maximum sequence length: 2049, sample length: 3941 +[default0]:Skipping sample id=1493714. Maximum sequence length: 2049, sample length: 2506 +[default0]:Skipping sample id=922497. Maximum sequence length: 2049, sample length: 3504 +[default0]:Skipping sample id=528967. Maximum sequence length: 2049, sample length: 3488 +[default0]:Skipping sample id=69118. Maximum sequence length: 2049, sample length: 4073 +[default0]:Skipping sample id=1527700. Maximum sequence length: 2049, sample length: 2121 +[default0]:Skipping sample id=1084197. Maximum sequence length: 2049, sample length: 3683 +[default0]:Skipping sample id=298215. Maximum sequence length: 2049, sample length: 3785 +[default0]:Skipping sample id=1275925. Maximum sequence length: 2049, sample length: 3203 +[default0]:Skipping sample id=913574. Maximum sequence length: 2049, sample length: 3047 +[default0]:Skipping sample id=883548. Maximum sequence length: 2049, sample length: 3429 +[default0]:Skipping sample id=904331. Maximum sequence length: 2049, sample length: 2637 +[default0]:Skipping sample id=658002. Maximum sequence length: 2049, sample length: 2224 +[default0]:Skipping sample id=1137449. Maximum sequence length: 2049, sample length: 2800 +[default0]:Skipping sample id=414127. Maximum sequence length: 2049, sample length: 3026 +[default0]:Skipping sample id=961004. Maximum sequence length: 2049, sample length: 2650 +[default0]:Skipping sample id=217352. Maximum sequence length: 2049, sample length: 2281 +[default0]:Skipping sample id=1540682. Maximum sequence length: 2049, sample length: 4095 +[default0]:Skipping sample id=1069899. Maximum sequence length: 2049, sample length: 4617 +[default0]:Skipping sample id=114997. Maximum sequence length: 2049, sample length: 3222 +[default0]:Skipping sample id=883288. Maximum sequence length: 2049, sample length: 2504 +[default0]:Skipping sample id=827639. Maximum sequence length: 2049, sample length: 2779 +[default0]:Skipping sample id=1026585. Maximum sequence length: 2049, sample length: 2138 +[default0]:Skipping sample id=1225262. Maximum sequence length: 2049, sample length: 2650 +[default0]:Skipping sample id=1203835. Maximum sequence length: 2049, sample length: 3445 +[default0]:Skipping sample id=1450249. Maximum sequence length: 2049, sample length: 2461 +[default0]:Skipping sample id=1089979. Maximum sequence length: 2049, sample length: 2366 +[default0]:Skipping sample id=368589. Maximum sequence length: 2049, sample length: 2774 +[default0]:Skipping sample id=543336. Maximum sequence length: 2049, sample length: 3899 +[default0]:Skipping sample id=169051. Maximum sequence length: 2049, sample length: 2855 +[default0]:Skipping sample id=505000. Maximum sequence length: 2049, sample length: 4746 +[default0]:Skipping sample id=1518392. Maximum sequence length: 2049, sample length: 2703 +[default0]:Skipping sample id=423936. Maximum sequence length: 2049, sample length: 2553 +[default0]:Skipping sample id=607754. Maximum sequence length: 2049, sample length: 2682 +[default0]:Skipping sample id=111882. Maximum sequence length: 2049, sample length: 2517 +[default0]:Skipping sample id=159325. Maximum sequence length: 2049, sample length: 3222 +[default0]:Skipping sample id=862717. Maximum sequence length: 2049, sample length: 2999 +[default0]:Skipping sample id=829841. Maximum sequence length: 2049, sample length: 2306 +[default0]:Skipping sample id=1048044. Maximum sequence length: 2049, sample length: 2125 +[default0]:Skipping sample id=507398. Maximum sequence length: 2049, sample length: 3938 +[default0]:Skipping sample id=900670. Maximum sequence length: 2049, sample length: 3330 +[default0]:Skipping sample id=1054390. Maximum sequence length: 2049, sample length: 2687 +[default0]:Skipping sample id=993238. Maximum sequence length: 2049, sample length: 2344 +[default0]:Skipping sample id=378695. Maximum sequence length: 2049, sample length: 2789 +[default0]:Skipping sample id=937008. Maximum sequence length: 2049, sample length: 4834 +[default0]:Skipping sample id=1437013. Maximum sequence length: 2049, sample length: 3208 +[default0]:Skipping sample id=1202981. Maximum sequence length: 2049, sample length: 2607 +[default0]:Skipping sample id=1024866. Maximum sequence length: 2049, sample length: 2533 +[default0]:Skipping sample id=721725. Maximum sequence length: 2049, sample length: 6455 +[default0]:Skipping sample id=1017418. Maximum sequence length: 2049, sample length: 2363 +[default0]:Skipping sample id=1217102. Maximum sequence length: 2049, sample length: 2590 +[default0]:Skipping sample id=1158861. Maximum sequence length: 2049, sample length: 3125 +[default0]:Skipping sample id=24557. Maximum sequence length: 2049, sample length: 2935 +[default0]:Skipping sample id=309951. Maximum sequence length: 2049, sample length: 2307 +[default0]:Skipping sample id=1232220. Maximum sequence length: 2049, sample length: 3124 +[default0]:Skipping sample id=780246. Maximum sequence length: 2049, sample length: 3919 +[default0]:Skipping sample id=410933. Maximum sequence length: 2049, sample length: 2211 +[default0]:Skipping sample id=677764. Maximum sequence length: 2049, sample length: 3157 +[default0]:Skipping sample id=812299. Maximum sequence length: 2049, sample length: 2483 +[default0]:Skipping sample id=340413. Maximum sequence length: 2049, sample length: 2362 +[default0]:Skipping sample id=876997. Maximum sequence length: 2049, sample length: 2372 +[default0]:Skipping sample id=98308. Maximum sequence length: 2049, sample length: 3596 +[default0]:Skipping sample id=429199. Maximum sequence length: 2049, sample length: 2068 +[default0]:Skipping sample id=198837. Maximum sequence length: 2049, sample length: 3966 +[default0]:Skipping sample id=11430. Maximum sequence length: 2049, sample length: 3350 +[default0]:Skipping sample id=910325. Maximum sequence length: 2049, sample length: 2180 +[default0]:Skipping sample id=639986. Maximum sequence length: 2049, sample length: 2440 +[default0]:Skipping sample id=1233078. Maximum sequence length: 2049, sample length: 2061 +[default0]:Skipping sample id=169063. Maximum sequence length: 2049, sample length: 3984 +[default0]:Skipping sample id=1194112. Maximum sequence length: 2049, sample length: 3763 +[default0]:Skipping sample id=212734. Maximum sequence length: 2049, sample length: 2175 +[default0]:Skipping sample id=343986. Maximum sequence length: 2049, sample length: 5166 +[default0]:Skipping sample id=1070818. Maximum sequence length: 2049, sample length: 4108 +[default0]:Skipping sample id=107424. Maximum sequence length: 2049, sample length: 4258 +[default0]:Skipping sample id=1071912. Maximum sequence length: 2049, sample length: 2678 +[default0]:Skipping sample id=662587. Maximum sequence length: 2049, sample length: 2149 +[default0]:Skipping sample id=1426374. Maximum sequence length: 2049, sample length: 3064 +[default0]:Skipping sample id=1388748. Maximum sequence length: 2049, sample length: 2541 +[default0]:Skipping sample id=415154. Maximum sequence length: 2049, sample length: 2149 +[default0]:Skipping sample id=813583. Maximum sequence length: 2049, sample length: 2743 +[default0]:Skipping sample id=1094532. Maximum sequence length: 2049, sample length: 3467 +[default0]:Skipping sample id=140727. Maximum sequence length: 2049, sample length: 4145 +[default0]:Skipping sample id=253782. Maximum sequence length: 2049, sample length: 2486 +[default0]:Skipping sample id=201057. Maximum sequence length: 2049, sample length: 2641 +[default0]:Skipping sample id=1438708. Maximum sequence length: 2049, sample length: 2898 +[default0]:Skipping sample id=747987. Maximum sequence length: 2049, sample length: 2061 +[default0]:Skipping sample id=351503. Maximum sequence length: 2049, sample length: 6173 +[default0]:Skipping sample id=1304405. Maximum sequence length: 2049, sample length: 2607 +[default0]:Skipping sample id=696821. Maximum sequence length: 2049, sample length: 2484 +[default0]:Skipping sample id=765167. Maximum sequence length: 2049, sample length: 2324 +[default0]:Skipping sample id=608469. Maximum sequence length: 2049, sample length: 2674 +[default0]:Skipping sample id=112456. Maximum sequence length: 2049, sample length: 2294 +[default0]:Skipping sample id=1013190. Maximum sequence length: 2049, sample length: 2330 +[default0]:Skipping sample id=1048040. Maximum sequence length: 2049, sample length: 2201 +[default0]:Skipping sample id=511407. Maximum sequence length: 2049, sample length: 2934 +[default0]:Skipping sample id=182258. Maximum sequence length: 2049, sample length: 4187 +[default0]:Skipping sample id=1305701. Maximum sequence length: 2049, sample length: 3537 +[default0]:Skipping sample id=263511. Maximum sequence length: 2049, sample length: 2119 +[default0]:Skipping sample id=1254338. Maximum sequence length: 2049, sample length: 4328 +[default0]:Skipping sample id=414531. Maximum sequence length: 2049, sample length: 2701 +[default0]:Skipping sample id=1115382. Maximum sequence length: 2049, sample length: 5427 +[default0]:Skipping sample id=467969. Maximum sequence length: 2049, sample length: 3214 +[default0]:Skipping sample id=1005082. Maximum sequence length: 2049, sample length: 2169 +[default0]:Skipping sample id=949379. Maximum sequence length: 2049, sample length: 2108 +[default0]:Skipping sample id=208197. Maximum sequence length: 2049, sample length: 2164 +[default0]:Skipping sample id=1384659. Maximum sequence length: 2049, sample length: 2627 +[default0]:Skipping sample id=217732. Maximum sequence length: 2049, sample length: 3815 +[default0]:Skipping sample id=212961. Maximum sequence length: 2049, sample length: 2363 +[default0]:Skipping sample id=633134. Maximum sequence length: 2049, sample length: 2958 +[default0]:Skipping sample id=1212664. Maximum sequence length: 2049, sample length: 2136 +[default0]:Skipping sample id=569297. Maximum sequence length: 2049, sample length: 2166 +[default0]:Skipping sample id=472664. Maximum sequence length: 2049, sample length: 2498 +[default0]:Skipping sample id=1348638. Maximum sequence length: 2049, sample length: 2373 +[default0]:Skipping sample id=447912. Maximum sequence length: 2049, sample length: 4275 +[default0]:Skipping sample id=613276. Maximum sequence length: 2049, sample length: 2498 +[default0]:Skipping sample id=1424016. Maximum sequence length: 2049, sample length: 2915 +[default0]:Skipping sample id=1507839. Maximum sequence length: 2049, sample length: 4006 +[default0]:Skipping sample id=1059963. Maximum sequence length: 2049, sample length: 2819 +[default0]:Skipping sample id=726171. Maximum sequence length: 2049, sample length: 2133 +[default0]:Skipping sample id=1261346. Maximum sequence length: 2049, sample length: 3048 +[default0]:Skipping sample id=1222223. Maximum sequence length: 2049, sample length: 2816 +[default0]:Skipping sample id=1400716. Maximum sequence length: 2049, sample length: 6827 +[default0]:Skipping sample id=2579. Maximum sequence length: 2049, sample length: 2976 +[default0]:Skipping sample id=832541. Maximum sequence length: 2049, sample length: 2641 +[default0]:Skipping sample id=1307964. Maximum sequence length: 2049, sample length: 2357 +[default0]:Skipping sample id=35528. Maximum sequence length: 2049, sample length: 2713 +[default0]:Skipping sample id=562097. Maximum sequence length: 2049, sample length: 4413 +[default0]:Skipping sample id=1568872. Maximum sequence length: 2049, sample length: 3380 +[default0]:Skipping sample id=1566602. Maximum sequence length: 2049, sample length: 6205 +[default0]:Skipping sample id=659687. Maximum sequence length: 2049, sample length: 3226 +[default0]:Skipping sample id=531020. Maximum sequence length: 2049, sample length: 2189 +[default0]:Skipping sample id=830779. Maximum sequence length: 2049, sample length: 2426 +[default0]:Skipping sample id=1113945. Maximum sequence length: 2049, sample length: 2113 +[default0]:Skipping sample id=1116311. Maximum sequence length: 2049, sample length: 2577 +[default0]:Skipping sample id=464957. Maximum sequence length: 2049, sample length: 3970 +[default0]:Skipping sample id=534787. Maximum sequence length: 2049, sample length: 3157 +[default0]:Skipping sample id=395930. Maximum sequence length: 2049, sample length: 2925 +[default0]:Skipping sample id=1237366. Maximum sequence length: 2049, sample length: 2340 +[default0]:Skipping sample id=152409. Maximum sequence length: 2049, sample length: 2050 +[default0]:Skipping sample id=1345781. Maximum sequence length: 2049, sample length: 2173 +[default0]:Skipping sample id=1550928. Maximum sequence length: 2049, sample length: 2382 +[default0]:Skipping sample id=378526. Maximum sequence length: 2049, sample length: 3303 +[default0]:Skipping sample id=486641. Maximum sequence length: 2049, sample length: 2104 +[default0]:Skipping sample id=733854. Maximum sequence length: 2049, sample length: 4066 +[default0]:Skipping sample id=1433651. Maximum sequence length: 2049, sample length: 2074 +[default0]:Skipping sample id=1570974. Maximum sequence length: 2049, sample length: 2839 +[default0]:Skipping sample id=464297. Maximum sequence length: 2049, sample length: 4238 +[default0]:Skipping sample id=1538297. Maximum sequence length: 2049, sample length: 3230 +[default0]:Skipping sample id=499353. Maximum sequence length: 2049, sample length: 4969 +[default0]:Skipping sample id=613683. Maximum sequence length: 2049, sample length: 2812 +[default0]:Skipping sample id=735037. Maximum sequence length: 2049, sample length: 5276 +[default0]:Skipping sample id=327404. Maximum sequence length: 2049, sample length: 2266 +[default0]:Skipping sample id=1429288. Maximum sequence length: 2049, sample length: 4674 +[default0]:Skipping sample id=1044014. Maximum sequence length: 2049, sample length: 2180 +[default0]:Skipping sample id=662827. Maximum sequence length: 2049, sample length: 2224 +[default0]:Skipping sample id=1445160. Maximum sequence length: 2049, sample length: 2275 +[default0]:Skipping sample id=1219700. Maximum sequence length: 2049, sample length: 2275 +[default0]:Skipping sample id=115585. Maximum sequence length: 2049, sample length: 2161 +[default0]:Skipping sample id=288575. Maximum sequence length: 2049, sample length: 3357 +[default0]:Skipping sample id=1023347. Maximum sequence length: 2049, sample length: 3310 +[default0]:Skipping sample id=658377. Maximum sequence length: 2049, sample length: 3964 +[default0]:Skipping sample id=751379. Maximum sequence length: 2049, sample length: 2144 +[default0]:Skipping sample id=616071. Maximum sequence length: 2049, sample length: 2172 +[default0]:Skipping sample id=429360. Maximum sequence length: 2049, sample length: 4141 +[default0]:Skipping sample id=777537. Maximum sequence length: 2049, sample length: 4222 +[default0]:Skipping sample id=1172724. Maximum sequence length: 2049, sample length: 4632 +[default0]:Skipping sample id=652355. Maximum sequence length: 2049, sample length: 2304 +[default0]:Skipping sample id=413102. Maximum sequence length: 2049, sample length: 3129 +[default0]:Skipping sample id=1005939. Maximum sequence length: 2049, sample length: 3395 +[default0]:Skipping sample id=939186. Maximum sequence length: 2049, sample length: 2631 +[default0]:Skipping sample id=584147. Maximum sequence length: 2049, sample length: 3415 +[default0]:Skipping sample id=1007865. Maximum sequence length: 2049, sample length: 2384 +[default0]:Skipping sample id=1163100. Maximum sequence length: 2049, sample length: 3333 +[default0]:Skipping sample id=605172. Maximum sequence length: 2049, sample length: 3250 +[default0]:Skipping sample id=1218936. Maximum sequence length: 2049, sample length: 2738 +[default0]:Skipping sample id=640687. Maximum sequence length: 2049, sample length: 3403 +[default0]:Skipping sample id=691251. Maximum sequence length: 2049, sample length: 2252 +[default0]:Skipping sample id=1503880. Maximum sequence length: 2049, sample length: 2200 +[default0]:Skipping sample id=764253. Maximum sequence length: 2049, sample length: 3553 +[default0]:Skipping sample id=253655. Maximum sequence length: 2049, sample length: 3567 +[default0]:Skipping sample id=1261392. Maximum sequence length: 2049, sample length: 2291 +[default0]:Skipping sample id=573893. Maximum sequence length: 2049, sample length: 3438 +[default0]:Skipping sample id=1123778. Maximum sequence length: 2049, sample length: 2388 +[default0]:Skipping sample id=615756. Maximum sequence length: 2049, sample length: 3283 +[default0]:Skipping sample id=1407777. Maximum sequence length: 2049, sample length: 5578 +[default0]:Skipping sample id=1461962. Maximum sequence length: 2049, sample length: 3270 +[default0]:Skipping sample id=1425315. Maximum sequence length: 2049, sample length: 2223 +[default0]:Skipping sample id=813798. Maximum sequence length: 2049, sample length: 3266 +[default0]:Skipping sample id=649631. Maximum sequence length: 2049, sample length: 2683 +[default0]:Skipping sample id=464160. Maximum sequence length: 2049, sample length: 2119 +[default0]:Skipping sample id=915910. Maximum sequence length: 2049, sample length: 3392 +[default0]:Skipping sample id=881002. Maximum sequence length: 2049, sample length: 2083 +[default0]:Skipping sample id=714328. Maximum sequence length: 2049, sample length: 3103 +[default0]:Skipping sample id=896592. Maximum sequence length: 2049, sample length: 2195 +[default0]:Skipping sample id=653672. Maximum sequence length: 2049, sample length: 3129 +[default0]:Skipping sample id=730439. Maximum sequence length: 2049, sample length: 2241 +[default0]:Skipping sample id=1257203. Maximum sequence length: 2049, sample length: 3179 +[default0]:Skipping sample id=1350608. Maximum sequence length: 2049, sample length: 2668 +[default0]:Skipping sample id=1503957. Maximum sequence length: 2049, sample length: 2182 +[default0]:Skipping sample id=1179393. Maximum sequence length: 2049, sample length: 2174 +[default0]:Skipping sample id=1480163. Maximum sequence length: 2049, sample length: 2817 +[default0]:Skipping sample id=854176. Maximum sequence length: 2049, sample length: 2251 +[default0]:Skipping sample id=755602. Maximum sequence length: 2049, sample length: 2862 +[default0]:Skipping sample id=1095828. Maximum sequence length: 2049, sample length: 2588 +[default0]:Skipping sample id=928774. Maximum sequence length: 2049, sample length: 2585 +[default0]:Skipping sample id=663553. Maximum sequence length: 2049, sample length: 3385 +[default0]:Skipping sample id=1277548. Maximum sequence length: 2049, sample length: 2394 +[default0]:Skipping sample id=1348423. Maximum sequence length: 2049, sample length: 2303 +[default0]:Skipping sample id=1314091. Maximum sequence length: 2049, sample length: 3564 +[default0]:Skipping sample id=357930. Maximum sequence length: 2049, sample length: 2721 +[default0]:Skipping sample id=1152138. Maximum sequence length: 2049, sample length: 2675 +[default0]:Skipping sample id=176998. Maximum sequence length: 2049, sample length: 3109 +[default0]:Skipping sample id=567570. Maximum sequence length: 2049, sample length: 3887 +[default0]:Skipping sample id=1006337. Maximum sequence length: 2049, sample length: 4013 +[default0]:Skipping sample id=590323. Maximum sequence length: 2049, sample length: 2461 +[default0]:Skipping sample id=517158. Maximum sequence length: 2049, sample length: 2237 +[default0]:Skipping sample id=1556896. Maximum sequence length: 2049, sample length: 2103 +[default0]:Skipping sample id=1135600. Maximum sequence length: 2049, sample length: 2230 +[default0]:Skipping sample id=1087273. Maximum sequence length: 2049, sample length: 2308 +[default0]:Skipping sample id=1475943. Maximum sequence length: 2049, sample length: 2798 +[default0]:Skipping sample id=261512. Maximum sequence length: 2049, sample length: 3424 +[default0]:Skipping sample id=422897. Maximum sequence length: 2049, sample length: 2247 +[default0]:Skipping sample id=148495. Maximum sequence length: 2049, sample length: 2091 +[default0]:Skipping sample id=1005581. Maximum sequence length: 2049, sample length: 3582 +[default0]:Skipping sample id=1363750. Maximum sequence length: 2049, sample length: 2631 +[default0]:Skipping sample id=1303903. Maximum sequence length: 2049, sample length: 3200 +[default0]:Skipping sample id=1217469. Maximum sequence length: 2049, sample length: 3432 +[default0]:Skipping sample id=149806. Maximum sequence length: 2049, sample length: 3710 +[default0]:Skipping sample id=707511. Maximum sequence length: 2049, sample length: 2941 +[default0]:Skipping sample id=1566199. Maximum sequence length: 2049, sample length: 3504 +[default0]:Skipping sample id=976575. Maximum sequence length: 2049, sample length: 2816 +[default0]:Skipping sample id=1008218. Maximum sequence length: 2049, sample length: 4063 +[default0]:Skipping sample id=633671. Maximum sequence length: 2049, sample length: 2219 +[default0]:Skipping sample id=958994. Maximum sequence length: 2049, sample length: 2858 +[default0]:Skipping sample id=533399. Maximum sequence length: 2049, sample length: 2348 +[default0]:Skipping sample id=507847. Maximum sequence length: 2049, sample length: 3423 +[default0]:Skipping sample id=721015. Maximum sequence length: 2049, sample length: 4131 +[default0]:Skipping sample id=531698. Maximum sequence length: 2049, sample length: 2224 +[default0]:Skipping sample id=1326813. Maximum sequence length: 2049, sample length: 2799 +[default0]:Skipping sample id=121444. Maximum sequence length: 2049, sample length: 2196 +[default0]:Skipping sample id=1444780. Maximum sequence length: 2049, sample length: 2214 +[default0]:Skipping sample id=917556. Maximum sequence length: 2049, sample length: 2182 +[default0]:Skipping sample id=1468363. Maximum sequence length: 2049, sample length: 2068 +[default0]:Skipping sample id=1525592. Maximum sequence length: 2049, sample length: 2075 +[default0]:Skipping sample id=701774. Maximum sequence length: 2049, sample length: 3001 +[default0]:Skipping sample id=1224084. Maximum sequence length: 2049, sample length: 2314 +[default0]:Skipping sample id=1407019. Maximum sequence length: 2049, sample length: 2673 +[default0]:Skipping sample id=60172. Maximum sequence length: 2049, sample length: 3268 +[default0]:Skipping sample id=742796. Maximum sequence length: 2049, sample length: 2883 +[default0]:Skipping sample id=711491. Maximum sequence length: 2049, sample length: 3049 +[default0]:Skipping sample id=1566519. Maximum sequence length: 2049, sample length: 2548 +[default0]:Skipping sample id=477709. Maximum sequence length: 2049, sample length: 4102 +[default0]:Skipping sample id=154821. Maximum sequence length: 2049, sample length: 2543 +[default0]:Skipping sample id=467026. Maximum sequence length: 2049, sample length: 2731 +[default0]:Skipping sample id=119396. Maximum sequence length: 2049, sample length: 2213 +[default0]:Skipping sample id=739581. Maximum sequence length: 2049, sample length: 4509 +[default0]:Skipping sample id=188864. Maximum sequence length: 2049, sample length: 2566 +[default0]:Skipping sample id=931800. Maximum sequence length: 2049, sample length: 3377 +[default0]:Skipping sample id=352493. Maximum sequence length: 2049, sample length: 2478 +[default0]:Skipping sample id=714251. Maximum sequence length: 2049, sample length: 2804 +[default0]:Skipping sample id=861210. Maximum sequence length: 2049, sample length: 2472 +[default0]:Skipping sample id=175414. Maximum sequence length: 2049, sample length: 4479 +[default0]:Skipping sample id=1481404. Maximum sequence length: 2049, sample length: 2166 +[default0]:Skipping sample id=879449. Maximum sequence length: 2049, sample length: 4542 +[default0]:Skipping sample id=1133479. Maximum sequence length: 2049, sample length: 2239 +[default0]:Skipping sample id=421509. Maximum sequence length: 2049, sample length: 2666 +[default0]:Skipping sample id=1294746. Maximum sequence length: 2049, sample length: 2174 +[default0]:Skipping sample id=1520307. Maximum sequence length: 2049, sample length: 2373 +[default0]:Skipping sample id=920368. Maximum sequence length: 2049, sample length: 2229 +[default0]:Skipping sample id=68072. Maximum sequence length: 2049, sample length: 2334 +[default0]:Skipping sample id=1482195. Maximum sequence length: 2049, sample length: 2124 +[default0]:Skipping sample id=1205036. Maximum sequence length: 2049, sample length: 2914 +[default0]:Skipping sample id=303940. Maximum sequence length: 2049, sample length: 2397 +[default0]:Skipping sample id=1004828. Maximum sequence length: 2049, sample length: 4615 +[default0]:Skipping sample id=1073277. Maximum sequence length: 2049, sample length: 2507 +[default0]:Skipping sample id=1097237. Maximum sequence length: 2049, sample length: 4125 +[default0]:Skipping sample id=585731. Maximum sequence length: 2049, sample length: 3310 +[default0]:Skipping sample id=1466031. Maximum sequence length: 2049, sample length: 2407 +[default0]:Skipping sample id=1506211. Maximum sequence length: 2049, sample length: 2737 +[default0]:Skipping sample id=627671. Maximum sequence length: 2049, sample length: 3741 +[default0]:Skipping sample id=376644. Maximum sequence length: 2049, sample length: 2469 +[default0]:Skipping sample id=326332. Maximum sequence length: 2049, sample length: 2383 +[default0]:Skipping sample id=913536. Maximum sequence length: 2049, sample length: 3041 +[default0]:Skipping sample id=1066996. Maximum sequence length: 2049, sample length: 2970 +[default0]:Skipping sample id=1207218. Maximum sequence length: 2049, sample length: 2081 +[default0]:Skipping sample id=830876. Maximum sequence length: 2049, sample length: 3146 +[default0]:Skipping sample id=67189. Maximum sequence length: 2049, sample length: 2051 +[default0]:Skipping sample id=1568594. Maximum sequence length: 2049, sample length: 2081 +[default0]:Skipping sample id=87142. Maximum sequence length: 2049, sample length: 2318 +[default0]:Skipping sample id=70584. Maximum sequence length: 2049, sample length: 2156 +[default0]:Skipping sample id=575750. Maximum sequence length: 2049, sample length: 3427 +[default0]:Skipping sample id=774326. Maximum sequence length: 2049, sample length: 4770 +[default0]:Skipping sample id=110581. Maximum sequence length: 2049, sample length: 2348 +[default0]:Skipping sample id=709943. Maximum sequence length: 2049, sample length: 2066 +[default0]:Skipping sample id=428120. Maximum sequence length: 2049, sample length: 2345 +[default0]:Skipping sample id=1086794. Maximum sequence length: 2049, sample length: 2621 +[default0]:Skipping sample id=131380. Maximum sequence length: 2049, sample length: 2676 +[default0]:Skipping sample id=1496297. Maximum sequence length: 2049, sample length: 3008 +[default0]:Skipping sample id=1125870. Maximum sequence length: 2049, sample length: 2212 +[default0]:Skipping sample id=1170037. Maximum sequence length: 2049, sample length: 2322 +[default0]:Skipping sample id=123613. Maximum sequence length: 2049, sample length: 2224 +[default0]:Skipping sample id=333889. Maximum sequence length: 2049, sample length: 2055 +[default0]:Skipping sample id=600632. Maximum sequence length: 2049, sample length: 3000 +[default0]:Skipping sample id=652231. Maximum sequence length: 2049, sample length: 3334 +[default0]:Skipping sample id=970807. Maximum sequence length: 2049, sample length: 2237 +[default0]:Skipping sample id=379742. Maximum sequence length: 2049, sample length: 2156 +[default0]:Skipping sample id=682569. Maximum sequence length: 2049, sample length: 2580 +[default0]:Skipping sample id=941432. Maximum sequence length: 2049, sample length: 3134 +[default0]:Skipping sample id=167320. Maximum sequence length: 2049, sample length: 2808 +[default0]:Skipping sample id=760225. Maximum sequence length: 2049, sample length: 4317 +[default0]:Skipping sample id=281797. Maximum sequence length: 2049, sample length: 2231 +[default0]:Skipping sample id=971583. Maximum sequence length: 2049, sample length: 2432 +[default0]:Skipping sample id=1003953. Maximum sequence length: 2049, sample length: 2740 +[default0]:Skipping sample id=685954. Maximum sequence length: 2049, sample length: 2211 +[default0]:Skipping sample id=435246. Maximum sequence length: 2049, sample length: 2690 +[default0]:Skipping sample id=208506. Maximum sequence length: 2049, sample length: 2609 +[default0]:Skipping sample id=1018162. Maximum sequence length: 2049, sample length: 2071 +[default0]:Skipping sample id=1138770. Maximum sequence length: 2049, sample length: 2320 +[default0]:Skipping sample id=668374. Maximum sequence length: 2049, sample length: 3024 +[default0]:Skipping sample id=678094. Maximum sequence length: 2049, sample length: 4681 +[default0]:Skipping sample id=1309445. Maximum sequence length: 2049, sample length: 2208 +[default0]:Skipping sample id=198198. Maximum sequence length: 2049, sample length: 2209 +[default0]:Skipping sample id=887556. Maximum sequence length: 2049, sample length: 6387 +[default0]:Skipping sample id=134115. Maximum sequence length: 2049, sample length: 3339 +[default0]:Skipping sample id=1353247. Maximum sequence length: 2049, sample length: 3341 +[default0]:Skipping sample id=1279397. Maximum sequence length: 2049, sample length: 2632 +[default0]:Skipping sample id=605867. Maximum sequence length: 2049, sample length: 3713 +[default0]:Skipping sample id=102571. Maximum sequence length: 2049, sample length: 2058 +[default0]:Skipping sample id=72906. Maximum sequence length: 2049, sample length: 4000 +[default0]:Skipping sample id=1352039. Maximum sequence length: 2049, sample length: 3330 +[default0]:Skipping sample id=1383417. Maximum sequence length: 2049, sample length: 3416 +[default0]:Skipping sample id=1542109. Maximum sequence length: 2049, sample length: 4326 +[default0]:Skipping sample id=453692. Maximum sequence length: 2049, sample length: 3971 +[default0]:Skipping sample id=714410. Maximum sequence length: 2049, sample length: 2999 +[default0]:Skipping sample id=1800. Maximum sequence length: 2049, sample length: 2215 +[default0]:Skipping sample id=1167401. Maximum sequence length: 2049, sample length: 2381 +[default0]:Skipping sample id=1207697. Maximum sequence length: 2049, sample length: 2122 +[default0]:Skipping sample id=374304. Maximum sequence length: 2049, sample length: 2721 +[default0]:Skipping sample id=439384. Maximum sequence length: 2049, sample length: 2437 +[default0]:Skipping sample id=723890. Maximum sequence length: 2049, sample length: 2678 +[default0]:Skipping sample id=422264. Maximum sequence length: 2049, sample length: 2462 +[default0]:Skipping sample id=1232410. Maximum sequence length: 2049, sample length: 3140 +[default0]:Skipping sample id=664244. Maximum sequence length: 2049, sample length: 2259 +[default0]:Skipping sample id=528852. Maximum sequence length: 2049, sample length: 3472 +[default0]:Skipping sample id=735608. Maximum sequence length: 2049, sample length: 3519 +[default0]:Skipping sample id=652183. Maximum sequence length: 2049, sample length: 3767 +[default0]:Skipping sample id=643107. Maximum sequence length: 2049, sample length: 2839 +[default0]:Skipping sample id=1238978. Maximum sequence length: 2049, sample length: 2742 +[default0]:Skipping sample id=365887. Maximum sequence length: 2049, sample length: 3999 +[default0]:Skipping sample id=1117589. Maximum sequence length: 2049, sample length: 3354 +[default0]:Skipping sample id=104956. Maximum sequence length: 2049, sample length: 2721 +[default0]:Skipping sample id=589994. Maximum sequence length: 2049, sample length: 4747 +[default0]:Skipping sample id=183592. Maximum sequence length: 2049, sample length: 3234 +[default0]:Skipping sample id=583820. Maximum sequence length: 2049, sample length: 2198 +[default0]:Skipping sample id=667657. Maximum sequence length: 2049, sample length: 2529 +[default0]:Skipping sample id=244531. Maximum sequence length: 2049, sample length: 2338 +[default0]:Skipping sample id=514439. Maximum sequence length: 2049, sample length: 4167 +[default0]:Skipping sample id=84202. Maximum sequence length: 2049, sample length: 2958 +[default0]:Skipping sample id=48234. Maximum sequence length: 2049, sample length: 3966 +[default0]:Skipping sample id=1161173. Maximum sequence length: 2049, sample length: 4546 +[default0]:Skipping sample id=1333400. Maximum sequence length: 2049, sample length: 2142 +[default0]:Skipping sample id=573699. Maximum sequence length: 2049, sample length: 3626 +[default0]:Skipping sample id=320939. Maximum sequence length: 2049, sample length: 2972 +[default0]:Skipping sample id=1132303. Maximum sequence length: 2049, sample length: 2798 +[default0]:Skipping sample id=53802. Maximum sequence length: 2049, sample length: 2747 +[default0]:Skipping sample id=1488129. Maximum sequence length: 2049, sample length: 2555 +[default0]:Skipping sample id=293358. Maximum sequence length: 2049, sample length: 3914 +[default0]:Skipping sample id=372850. Maximum sequence length: 2049, sample length: 3129 +[default0]:Skipping sample id=1020548. Maximum sequence length: 2049, sample length: 2400 +[default0]:Skipping sample id=285493. Maximum sequence length: 2049, sample length: 2091 +[default0]:Skipping sample id=1323760. Maximum sequence length: 2049, sample length: 3278 +[default0]:Skipping sample id=227752. Maximum sequence length: 2049, sample length: 2079 +[default0]:Skipping sample id=1346929. Maximum sequence length: 2049, sample length: 2266 +[default0]:Skipping sample id=481858. Maximum sequence length: 2049, sample length: 4093 +[default0]:Skipping sample id=849065. Maximum sequence length: 2049, sample length: 2439 +[default0]:Skipping sample id=828402. Maximum sequence length: 2049, sample length: 2468 +[default0]:Skipping sample id=1279149. Maximum sequence length: 2049, sample length: 4225 +[default0]:Skipping sample id=801992. Maximum sequence length: 2049, sample length: 3013 +[default0]:Skipping sample id=24446. Maximum sequence length: 2049, sample length: 2340 +[default0]:Skipping sample id=56636. Maximum sequence length: 2049, sample length: 2341 +[default0]:Skipping sample id=1080541. Maximum sequence length: 2049, sample length: 2096 +[default0]:Skipping sample id=1426222. Maximum sequence length: 2049, sample length: 2970 +[default0]:Skipping sample id=1280138. Maximum sequence length: 2049, sample length: 3897 +[default0]:Skipping sample id=1331276. Maximum sequence length: 2049, sample length: 2336 +[default0]:Skipping sample id=1246611. Maximum sequence length: 2049, sample length: 5536 +[default0]:Skipping sample id=1210751. Maximum sequence length: 2049, sample length: 2320 +[default0]:Skipping sample id=1496049. Maximum sequence length: 2049, sample length: 2944 +[default0]:Skipping sample id=1187922. Maximum sequence length: 2049, sample length: 2229 +[default0]:Skipping sample id=1199825. Maximum sequence length: 2049, sample length: 2307 +[default0]:Skipping sample id=978706. Maximum sequence length: 2049, sample length: 3098 +[default0]:Skipping sample id=1142019. Maximum sequence length: 2049, sample length: 2312 +[default0]:Skipping sample id=1463054. Maximum sequence length: 2049, sample length: 3408 +[default0]:Skipping sample id=277606. Maximum sequence length: 2049, sample length: 2568 +[default0]:Skipping sample id=221410. Maximum sequence length: 2049, sample length: 5137 +[default0]:Skipping sample id=216063. Maximum sequence length: 2049, sample length: 2927 +[default0]:Skipping sample id=1122774. Maximum sequence length: 2049, sample length: 2703 +[default0]:Skipping sample id=877410. Maximum sequence length: 2049, sample length: 2354 +[default0]:Skipping sample id=1166714. Maximum sequence length: 2049, sample length: 2488 +[default0]:Skipping sample id=294785. Maximum sequence length: 2049, sample length: 2518 +[default0]:Skipping sample id=437708. Maximum sequence length: 2049, sample length: 2097 +[default0]:Skipping sample id=387863. Maximum sequence length: 2049, sample length: 2234 +[default0]:Skipping sample id=128233. Maximum sequence length: 2049, sample length: 5067 +[default0]:Skipping sample id=194850. Maximum sequence length: 2049, sample length: 2349 +[default0]:Skipping sample id=481611. Maximum sequence length: 2049, sample length: 2164 +[default0]:Skipping sample id=1150855. Maximum sequence length: 2049, sample length: 2208 +[default0]:Skipping sample id=285374. Maximum sequence length: 2049, sample length: 2251 +[default0]:Skipping sample id=1213178. Maximum sequence length: 2049, sample length: 2249 +[default0]:Skipping sample id=659042. Maximum sequence length: 2049, sample length: 2495 +[default0]:Skipping sample id=35851. Maximum sequence length: 2049, sample length: 2621 +[default0]:Skipping sample id=1073664. Maximum sequence length: 2049, sample length: 3276 +[default0]:Skipping sample id=476582. Maximum sequence length: 2049, sample length: 2743 +[default0]:Skipping sample id=201074. Maximum sequence length: 2049, sample length: 3834 +[default0]:Skipping sample id=996980. Maximum sequence length: 2049, sample length: 2062 +[default0]:Skipping sample id=269821. Maximum sequence length: 2049, sample length: 3940 +[default0]:Skipping sample id=1404430. Maximum sequence length: 2049, sample length: 2252 +[default0]:Skipping sample id=726586. Maximum sequence length: 2049, sample length: 5200 +[default0]:Skipping sample id=898342. Maximum sequence length: 2049, sample length: 2107 +[default0]:Skipping sample id=1053784. Maximum sequence length: 2049, sample length: 3348 +[default0]:Skipping sample id=1066489. Maximum sequence length: 2049, sample length: 6368 +[default0]:Skipping sample id=1449058. Maximum sequence length: 2049, sample length: 2371 +[default0]:Skipping sample id=5575. Maximum sequence length: 2049, sample length: 2133 +[default0]:Skipping sample id=1512196. Maximum sequence length: 2049, sample length: 2746 +[default0]:Skipping sample id=79190. Maximum sequence length: 2049, sample length: 3116 +[default0]:Skipping sample id=843355. Maximum sequence length: 2049, sample length: 5704 +[default0]:Skipping sample id=1048604. Maximum sequence length: 2049, sample length: 2772 +[default0]:Skipping sample id=564303. Maximum sequence length: 2049, sample length: 2065 +[default0]:Skipping sample id=369246. Maximum sequence length: 2049, sample length: 2738 +[default0]:Skipping sample id=536315. Maximum sequence length: 2049, sample length: 5218 +[default0]:Skipping sample id=1230860. Maximum sequence length: 2049, sample length: 3069 +[default0]:Skipping sample id=1096525. Maximum sequence length: 2049, sample length: 3054 +[default0]:Skipping sample id=1327501. Maximum sequence length: 2049, sample length: 2484 +[default0]:Skipping sample id=935859. Maximum sequence length: 2049, sample length: 6104 +[default0]:Skipping sample id=1149390. Maximum sequence length: 2049, sample length: 4570 +[default0]:Skipping sample id=1176368. Maximum sequence length: 2049, sample length: 2668 +[default0]:Skipping sample id=152. Maximum sequence length: 2049, sample length: 2266 +[default0]:Skipping sample id=595809. Maximum sequence length: 2049, sample length: 2070 +[default0]:Skipping sample id=761479. Maximum sequence length: 2049, sample length: 4629 +[default0]:Skipping sample id=828913. Maximum sequence length: 2049, sample length: 2210 +[default0]:Skipping sample id=242231. Maximum sequence length: 2049, sample length: 4428 +[default0]:Skipping sample id=93036. Maximum sequence length: 2049, sample length: 2874 +[default0]:Skipping sample id=1403529. Maximum sequence length: 2049, sample length: 2574 +[default0]:Skipping sample id=1411010. Maximum sequence length: 2049, sample length: 2443 +[default0]:Skipping sample id=1542232. Maximum sequence length: 2049, sample length: 2594 +[default0]:Skipping sample id=954609. Maximum sequence length: 2049, sample length: 2635 +[default0]:Skipping sample id=1365747. Maximum sequence length: 2049, sample length: 2581 +[default0]:Skipping sample id=252254. Maximum sequence length: 2049, sample length: 2341 +[default0]:Skipping sample id=891716. Maximum sequence length: 2049, sample length: 4904 +[default0]:Skipping sample id=1141229. Maximum sequence length: 2049, sample length: 2380 +[default0]:Skipping sample id=994409. Maximum sequence length: 2049, sample length: 2624 +[default0]:Skipping sample id=1402961. Maximum sequence length: 2049, sample length: 3191 +[default0]:Skipping sample id=16252. Maximum sequence length: 2049, sample length: 3679 +[default0]:Skipping sample id=1231892. Maximum sequence length: 2049, sample length: 2076 +[default0]:Skipping sample id=562819. Maximum sequence length: 2049, sample length: 3241 +[default0]:Skipping sample id=1096861. Maximum sequence length: 2049, sample length: 4688 +[default0]:Skipping sample id=555588. Maximum sequence length: 2049, sample length: 3600 +[default0]:Skipping sample id=450896. Maximum sequence length: 2049, sample length: 2868 +[default0]:Skipping sample id=1001793. Maximum sequence length: 2049, sample length: 3030 +[default0]:Skipping sample id=288630. Maximum sequence length: 2049, sample length: 4959 +[default0]:Skipping sample id=587492. Maximum sequence length: 2049, sample length: 2746 +[default0]:Skipping sample id=725940. Maximum sequence length: 2049, sample length: 3399 +[default0]:Skipping sample id=1007694. Maximum sequence length: 2049, sample length: 3409 +[default0]:Skipping sample id=1188904. Maximum sequence length: 2049, sample length: 2451 +[default0]:Skipping sample id=1400732. Maximum sequence length: 2049, sample length: 2175 +[default0]:Skipping sample id=402329. Maximum sequence length: 2049, sample length: 4644 +[default0]:Skipping sample id=1406588. Maximum sequence length: 2049, sample length: 2662 +[default0]:Skipping sample id=1076805. Maximum sequence length: 2049, sample length: 2200 +[default0]:Skipping sample id=1125233. Maximum sequence length: 2049, sample length: 2145 +[default0]:Skipping sample id=634767. Maximum sequence length: 2049, sample length: 2067 +[default0]:Skipping sample id=1308031. Maximum sequence length: 2049, sample length: 3815 +[default0]:Skipping sample id=135671. Maximum sequence length: 2049, sample length: 2719 +[default0]:Skipping sample id=845086. Maximum sequence length: 2049, sample length: 2076 +[default0]:Skipping sample id=1020037. Maximum sequence length: 2049, sample length: 3840 +[default0]:Skipping sample id=1124704. Maximum sequence length: 2049, sample length: 2605 +[default0]:Skipping sample id=869052. Maximum sequence length: 2049, sample length: 2296 +[default0]:Skipping sample id=327720. Maximum sequence length: 2049, sample length: 2473 +[default0]:Skipping sample id=1312399. Maximum sequence length: 2049, sample length: 2379 +[default0]:Skipping sample id=975842. Maximum sequence length: 2049, sample length: 2113 +[default0]:Skipping sample id=515591. Maximum sequence length: 2049, sample length: 2666 +[default0]:Skipping sample id=363000. Maximum sequence length: 2049, sample length: 2159 +[default0]:Skipping sample id=840722. Maximum sequence length: 2049, sample length: 2070 +[default0]:Skipping sample id=1286766. Maximum sequence length: 2049, sample length: 2488 +[default0]:Skipping sample id=625686. Maximum sequence length: 2049, sample length: 2386 +[default0]:Skipping sample id=546688. Maximum sequence length: 2049, sample length: 3314 +[default0]:Skipping sample id=168187. Maximum sequence length: 2049, sample length: 2496 +[default0]:Skipping sample id=551428. Maximum sequence length: 2049, sample length: 2994 +[default0]:Skipping sample id=401649. Maximum sequence length: 2049, sample length: 2555 +[default0]:Skipping sample id=478101. Maximum sequence length: 2049, sample length: 3434 +[default0]:Skipping sample id=890699. Maximum sequence length: 2049, sample length: 3910 +[default0]:Skipping sample id=368873. Maximum sequence length: 2049, sample length: 3428 +[default0]:Skipping sample id=470045. Maximum sequence length: 2049, sample length: 2439 +[default0]:Skipping sample id=50243. Maximum sequence length: 2049, sample length: 2831 +[default0]:Skipping sample id=934882. Maximum sequence length: 2049, sample length: 2359 +[default0]:Skipping sample id=1389630. Maximum sequence length: 2049, sample length: 2067 +[default0]:Skipping sample id=614007. Maximum sequence length: 2049, sample length: 2575 +[default0]:Skipping sample id=945263. Maximum sequence length: 2049, sample length: 2070 +[default0]:Skipping sample id=1460125. Maximum sequence length: 2049, sample length: 2492 +[default0]:Skipping sample id=927524. Maximum sequence length: 2049, sample length: 2844 +[default0]:Skipping sample id=613835. Maximum sequence length: 2049, sample length: 2183 +[default0]:Skipping sample id=584453. Maximum sequence length: 2049, sample length: 2174 +[default0]:Skipping sample id=92138. Maximum sequence length: 2049, sample length: 6809 +[default0]:Skipping sample id=517336. Maximum sequence length: 2049, sample length: 2996 +[default0]:Skipping sample id=333689. Maximum sequence length: 2049, sample length: 3288 +[default0]:Skipping sample id=65493. Maximum sequence length: 2049, sample length: 2581 +[default0]:Skipping sample id=1296718. Maximum sequence length: 2049, sample length: 2182 +[default0]:Skipping sample id=639615. Maximum sequence length: 2049, sample length: 2064 +[default0]:Skipping sample id=1214392. Maximum sequence length: 2049, sample length: 2101 +[default0]:Skipping sample id=547884. Maximum sequence length: 2049, sample length: 3188 +[default0]:Skipping sample id=583555. Maximum sequence length: 2049, sample length: 3103 +[default0]:Skipping sample id=573782. Maximum sequence length: 2049, sample length: 2053 +[default0]:Skipping sample id=1569232. Maximum sequence length: 2049, sample length: 3079 +[default0]:Skipping sample id=929743. Maximum sequence length: 2049, sample length: 2462 +[default0]:Skipping sample id=1562784. Maximum sequence length: 2049, sample length: 2198 +[default0]:Skipping sample id=888456. Maximum sequence length: 2049, sample length: 2298 +[default0]:Skipping sample id=220914. Maximum sequence length: 2049, sample length: 2228 +[default0]:Skipping sample id=17098. Maximum sequence length: 2049, sample length: 2547 +[default0]:Skipping sample id=752055. Maximum sequence length: 2049, sample length: 2237 +[default0]:Skipping sample id=586713. Maximum sequence length: 2049, sample length: 2077 +[default0]:Skipping sample id=682406. Maximum sequence length: 2049, sample length: 2719 +[default0]:Skipping sample id=425481. Maximum sequence length: 2049, sample length: 2299 +[default0]:Skipping sample id=13520. Maximum sequence length: 2049, sample length: 3370 +[default0]:Skipping sample id=1173025. Maximum sequence length: 2049, sample length: 2057 +[default0]:Skipping sample id=1635. Maximum sequence length: 2049, sample length: 2054 +[default0]:Skipping sample id=1528382. Maximum sequence length: 2049, sample length: 2769 +[default0]:Skipping sample id=1161990. Maximum sequence length: 2049, sample length: 2505 +[default0]:Skipping sample id=939829. Maximum sequence length: 2049, sample length: 2411 +[default0]:Skipping sample id=292817. Maximum sequence length: 2049, sample length: 2050 +[default0]:Skipping sample id=122804. Maximum sequence length: 2049, sample length: 3521 +[default0]:Skipping sample id=381120. Maximum sequence length: 2049, sample length: 3638 +[default0]:Skipping sample id=291260. Maximum sequence length: 2049, sample length: 2385 +[default0]:Skipping sample id=370592. Maximum sequence length: 2049, sample length: 2097 +[default0]:Skipping sample id=868583. Maximum sequence length: 2049, sample length: 2643 +[default0]:Skipping sample id=472440. Maximum sequence length: 2049, sample length: 3784 +[default0]:Skipping sample id=628230. Maximum sequence length: 2049, sample length: 3687 +[default0]:Skipping sample id=902482. Maximum sequence length: 2049, sample length: 2219 +[default0]:Skipping sample id=1182438. Maximum sequence length: 2049, sample length: 2309 +[default0]:Skipping sample id=1425736. Maximum sequence length: 2049, sample length: 2347 +[default0]:Skipping sample id=1275579. Maximum sequence length: 2049, sample length: 2314 +[default0]:Skipping sample id=1210014. Maximum sequence length: 2049, sample length: 2073 +[default0]:Skipping sample id=75624. Maximum sequence length: 2049, sample length: 3473 +[default0]:Skipping sample id=1232579. Maximum sequence length: 2049, sample length: 2472 +[default0]:Skipping sample id=979755. Maximum sequence length: 2049, sample length: 2711 +[default0]:Skipping sample id=1483221. Maximum sequence length: 2049, sample length: 2070 +[default0]:Skipping sample id=1261058. Maximum sequence length: 2049, sample length: 2997 +[default0]:Skipping sample id=1475163. Maximum sequence length: 2049, sample length: 4585 +[default0]:Skipping sample id=176132. Maximum sequence length: 2049, sample length: 5227 +[default0]:Skipping sample id=1280236. Maximum sequence length: 2049, sample length: 3650 +[default0]:Skipping sample id=1340856. Maximum sequence length: 2049, sample length: 2508 +[default0]:Skipping sample id=1225318. Maximum sequence length: 2049, sample length: 2914 +[default0]:Skipping sample id=498779. Maximum sequence length: 2049, sample length: 2290 +[default0]:Skipping sample id=864200. Maximum sequence length: 2049, sample length: 2325 +[default0]:Skipping sample id=269372. Maximum sequence length: 2049, sample length: 2474 +[default0]:Skipping sample id=759638. Maximum sequence length: 2049, sample length: 2343 +[default0]:Skipping sample id=1422960. Maximum sequence length: 2049, sample length: 2485 +[default0]:Skipping sample id=599332. Maximum sequence length: 2049, sample length: 2476 +[default0]:Skipping sample id=556945. Maximum sequence length: 2049, sample length: 3737 +[default0]:Skipping sample id=555753. Maximum sequence length: 2049, sample length: 3481 +[default0]:Skipping sample id=1549473. Maximum sequence length: 2049, sample length: 2596 +[default0]:Skipping sample id=1123857. Maximum sequence length: 2049, sample length: 2348 +[default0]:Skipping sample id=417913. Maximum sequence length: 2049, sample length: 3600 +[default0]:Skipping sample id=1520907. Maximum sequence length: 2049, sample length: 2110 +[default0]:Skipping sample id=55800. Maximum sequence length: 2049, sample length: 2660 +[default0]:Skipping sample id=108462. Maximum sequence length: 2049, sample length: 3027 +[default0]:Skipping sample id=1320653. Maximum sequence length: 2049, sample length: 4714 +[default0]:Skipping sample id=51129. Maximum sequence length: 2049, sample length: 4244 +[default0]:Skipping sample id=1500527. Maximum sequence length: 2049, sample length: 6188 +[default0]:Skipping sample id=1072918. Maximum sequence length: 2049, sample length: 2157 +[default0]:Skipping sample id=240633. Maximum sequence length: 2049, sample length: 2724 +[default0]:Skipping sample id=527687. Maximum sequence length: 2049, sample length: 2137 +[default0]:Skipping sample id=1346248. Maximum sequence length: 2049, sample length: 3288 +[default0]:Skipping sample id=991564. Maximum sequence length: 2049, sample length: 4314 +[default0]:Skipping sample id=800769. Maximum sequence length: 2049, sample length: 2949 +[default0]:Skipping sample id=1182505. Maximum sequence length: 2049, sample length: 2195 +[default0]:Skipping sample id=1222165. Maximum sequence length: 2049, sample length: 2165 +[default0]:Skipping sample id=969899. Maximum sequence length: 2049, sample length: 2580 +[default0]:Skipping sample id=1007444. Maximum sequence length: 2049, sample length: 2239 +[default0]:Skipping sample id=767926. Maximum sequence length: 2049, sample length: 2187 +[default0]:Skipping sample id=779553. Maximum sequence length: 2049, sample length: 2471 +[default0]:Skipping sample id=926340. Maximum sequence length: 2049, sample length: 3035 +[default0]:Skipping sample id=480768. Maximum sequence length: 2049, sample length: 2697 +[default0]:Skipping sample id=1452631. Maximum sequence length: 2049, sample length: 2334 +[default0]:Skipping sample id=498136. Maximum sequence length: 2049, sample length: 2284 +[default0]:Skipping sample id=967275. Maximum sequence length: 2049, sample length: 3139 +[default0]:Skipping sample id=1057979. Maximum sequence length: 2049, sample length: 2389 +[default0]:Skipping sample id=1303772. Maximum sequence length: 2049, sample length: 3666 +[default0]:Skipping sample id=37154. Maximum sequence length: 2049, sample length: 3332 +[default0]:Skipping sample id=1449820. Maximum sequence length: 2049, sample length: 2149 +[default0]:Skipping sample id=74589. Maximum sequence length: 2049, sample length: 2072 +[default0]:Skipping sample id=1394401. Maximum sequence length: 2049, sample length: 3994 +[default0]:Skipping sample id=809963. Maximum sequence length: 2049, sample length: 3281 +[default0]:Skipping sample id=1465045. Maximum sequence length: 2049, sample length: 2889 +[default0]:Skipping sample id=231567. Maximum sequence length: 2049, sample length: 3746 +[default0]:Skipping sample id=1356325. Maximum sequence length: 2049, sample length: 2512 +[default0]:Skipping sample id=913918. Maximum sequence length: 2049, sample length: 2190 +[default0]:Skipping sample id=1359431. Maximum sequence length: 2049, sample length: 2676 +[default0]:Skipping sample id=379134. Maximum sequence length: 2049, sample length: 2716 +[default0]:Skipping sample id=1568050. Maximum sequence length: 2049, sample length: 3404 +[default0]:Skipping sample id=160719. Maximum sequence length: 2049, sample length: 2311 +[default0]:Skipping sample id=937124. Maximum sequence length: 2049, sample length: 2931 +[default0]:Skipping sample id=993710. Maximum sequence length: 2049, sample length: 4818 +[default0]:Skipping sample id=12613. Maximum sequence length: 2049, sample length: 2280 +[default0]:Skipping sample id=3025. Maximum sequence length: 2049, sample length: 2335 +[default0]:Skipping sample id=667985. Maximum sequence length: 2049, sample length: 2290 +[default0]:Skipping sample id=860101. Maximum sequence length: 2049, sample length: 2101 +[default0]:Skipping sample id=1495764. Maximum sequence length: 2049, sample length: 2535 +[default0]:Skipping sample id=1194602. Maximum sequence length: 2049, sample length: 2366 +[default0]:Skipping sample id=1467987. Maximum sequence length: 2049, sample length: 3274 +[default0]:Skipping sample id=1348047. Maximum sequence length: 2049, sample length: 3069 +[default0]:Skipping sample id=316984. Maximum sequence length: 2049, sample length: 2249 +[default0]:Skipping sample id=762319. Maximum sequence length: 2049, sample length: 2504 +[default0]:Skipping sample id=1057036. Maximum sequence length: 2049, sample length: 2357 +[default0]:Skipping sample id=742311. Maximum sequence length: 2049, sample length: 2695 +[default0]:Skipping sample id=394500. Maximum sequence length: 2049, sample length: 2569 +[default0]:Skipping sample id=897180. Maximum sequence length: 2049, sample length: 2561 +[default0]:Skipping sample id=787723. Maximum sequence length: 2049, sample length: 2744 +[default0]:Skipping sample id=490705. Maximum sequence length: 2049, sample length: 3797 +[default0]:Skipping sample id=591818. Maximum sequence length: 2049, sample length: 3159 +[default0]:Skipping sample id=838241. Maximum sequence length: 2049, sample length: 3173 +[default0]:Skipping sample id=94298. Maximum sequence length: 2049, sample length: 3099 +[default0]:Skipping sample id=1504020. Maximum sequence length: 2049, sample length: 3343 +[default0]:Skipping sample id=733635. Maximum sequence length: 2049, sample length: 4446 +[default0]:Skipping sample id=578294. Maximum sequence length: 2049, sample length: 3052 +[default0]:Skipping sample id=74958. Maximum sequence length: 2049, sample length: 2329 +[default0]:Skipping sample id=492822. Maximum sequence length: 2049, sample length: 2574 +[default0]:Skipping sample id=555175. Maximum sequence length: 2049, sample length: 3869 +[default0]:Skipping sample id=99312. Maximum sequence length: 2049, sample length: 3159 +[default0]:Skipping sample id=1475854. Maximum sequence length: 2049, sample length: 3115 +[default0]:Skipping sample id=603217. Maximum sequence length: 2049, sample length: 2218 +[default0]:Skipping sample id=762206. Maximum sequence length: 2049, sample length: 2755 +[default0]:Skipping sample id=608408. Maximum sequence length: 2049, sample length: 2592 +[default0]:Skipping sample id=72614. Maximum sequence length: 2049, sample length: 2901 +[default0]:Skipping sample id=1566587. Maximum sequence length: 2049, sample length: 3591 +[default0]:Skipping sample id=1248532. Maximum sequence length: 2049, sample length: 2736 +[default0]:Skipping sample id=581401. Maximum sequence length: 2049, sample length: 2708 +[default0]:Skipping sample id=611418. Maximum sequence length: 2049, sample length: 2105 +[default0]:Skipping sample id=1027302. Maximum sequence length: 2049, sample length: 2115 +[default0]:Skipping sample id=594859. Maximum sequence length: 2049, sample length: 3633 +[default0]:Skipping sample id=1295643. Maximum sequence length: 2049, sample length: 2397 +[default0]:Skipping sample id=843429. Maximum sequence length: 2049, sample length: 3345 +[default0]:Skipping sample id=1474854. Maximum sequence length: 2049, sample length: 3123 +[default0]:Skipping sample id=1439540. Maximum sequence length: 2049, sample length: 2167 +[default0]:Skipping sample id=292091. Maximum sequence length: 2049, sample length: 2904 +[default0]:Skipping sample id=611246. Maximum sequence length: 2049, sample length: 4135 +[default0]:Skipping sample id=839667. Maximum sequence length: 2049, sample length: 2114 +[default0]:Skipping sample id=1160693. Maximum sequence length: 2049, sample length: 6032 +[default0]:Skipping sample id=90453. Maximum sequence length: 2049, sample length: 2100 +[default0]:Skipping sample id=1252464. Maximum sequence length: 2049, sample length: 2404 +[default0]:Skipping sample id=1026147. Maximum sequence length: 2049, sample length: 2243 +[default0]:Skipping sample id=88956. Maximum sequence length: 2049, sample length: 2805 +[default0]:Skipping sample id=592816. Maximum sequence length: 2049, sample length: 2210 +[default0]:Skipping sample id=420102. Maximum sequence length: 2049, sample length: 2693 +[default0]:Skipping sample id=1360044. Maximum sequence length: 2049, sample length: 3133 +[default0]:Skipping sample id=402560. Maximum sequence length: 2049, sample length: 2141 +[default0]:Skipping sample id=231277. Maximum sequence length: 2049, sample length: 2726 +[default0]:Skipping sample id=514222. Maximum sequence length: 2049, sample length: 2179 +[default0]:Skipping sample id=1301634. Maximum sequence length: 2049, sample length: 2124 +[default0]:Skipping sample id=1164949. Maximum sequence length: 2049, sample length: 4950 +[default0]:Skipping sample id=943775. Maximum sequence length: 2049, sample length: 5550 +[default0]:Skipping sample id=68453. Maximum sequence length: 2049, sample length: 3857 +[default0]:Skipping sample id=521446. Maximum sequence length: 2049, sample length: 3596 +[default0]:Skipping sample id=645313. Maximum sequence length: 2049, sample length: 2564 +[default0]:Skipping sample id=60156. Maximum sequence length: 2049, sample length: 2084 +[default0]:Skipping sample id=135563. Maximum sequence length: 2049, sample length: 2583 +[default0]:Skipping sample id=1098475. Maximum sequence length: 2049, sample length: 3130 +[default0]:Skipping sample id=1504893. Maximum sequence length: 2049, sample length: 2307 +[default0]:Skipping sample id=1032874. Maximum sequence length: 2049, sample length: 2374 +[default0]:Skipping sample id=1021826. Maximum sequence length: 2049, sample length: 2351 +[default0]:Skipping sample id=746443. Maximum sequence length: 2049, sample length: 2407 +[default0]:Skipping sample id=619756. Maximum sequence length: 2049, sample length: 3085 +[default0]:Skipping sample id=163298. Maximum sequence length: 2049, sample length: 3049 +[default0]:Skipping sample id=259051. Maximum sequence length: 2049, sample length: 3089 +[default0]:Skipping sample id=1329825. Maximum sequence length: 2049, sample length: 2171 +[default0]:Skipping sample id=863100. Maximum sequence length: 2049, sample length: 3989 +[default0]:Skipping sample id=1023444. Maximum sequence length: 2049, sample length: 2947 +[default0]:Skipping sample id=30036. Maximum sequence length: 2049, sample length: 2180 +[default0]:Skipping sample id=1417197. Maximum sequence length: 2049, sample length: 6587 +[default0]:Skipping sample id=523274. Maximum sequence length: 2049, sample length: 2192 +[default0]:Skipping sample id=690974. Maximum sequence length: 2049, sample length: 2196 +[default0]:Skipping sample id=841979. Maximum sequence length: 2049, sample length: 2155 +[default0]:Skipping sample id=9127. Maximum sequence length: 2049, sample length: 2611 +[default0]:Skipping sample id=136730. Maximum sequence length: 2049, sample length: 2449 +[default0]:Skipping sample id=783128. Maximum sequence length: 2049, sample length: 2085 +[default0]:Skipping sample id=1464996. Maximum sequence length: 2049, sample length: 3621 +[default0]:Skipping sample id=407190. Maximum sequence length: 2049, sample length: 2221 +[default0]:Skipping sample id=558528. Maximum sequence length: 2049, sample length: 2128 +[default0]:Skipping sample id=1396187. Maximum sequence length: 2049, sample length: 3471 +[default0]:Skipping sample id=123846. Maximum sequence length: 2049, sample length: 2874 +[default0]:Skipping sample id=881963. Maximum sequence length: 2049, sample length: 2767 +[default0]:Skipping sample id=797923. Maximum sequence length: 2049, sample length: 2627 +[default0]:Skipping sample id=889529. Maximum sequence length: 2049, sample length: 2549 +[default0]:Skipping sample id=532896. Maximum sequence length: 2049, sample length: 2908 +[default0]:Skipping sample id=645304. Maximum sequence length: 2049, sample length: 4192 +[default0]:Skipping sample id=798023. Maximum sequence length: 2049, sample length: 2994 +[default0]:Skipping sample id=1468534. Maximum sequence length: 2049, sample length: 2159 +[default0]:Skipping sample id=562534. Maximum sequence length: 2049, sample length: 4226 +[default0]:Skipping sample id=1188985. Maximum sequence length: 2049, sample length: 2254 +[default0]:Skipping sample id=1473865. Maximum sequence length: 2049, sample length: 4047 +[default0]:Skipping sample id=1002303. Maximum sequence length: 2049, sample length: 2701 +[default0]:Skipping sample id=1493870. Maximum sequence length: 2049, sample length: 2107 +[default0]:Skipping sample id=1245828. Maximum sequence length: 2049, sample length: 2539 +[default0]:Skipping sample id=467333. Maximum sequence length: 2049, sample length: 3446 +[default0]:Skipping sample id=712326. Maximum sequence length: 2049, sample length: 4817 +[default0]:Skipping sample id=423533. Maximum sequence length: 2049, sample length: 2961 +[default0]:Skipping sample id=472034. Maximum sequence length: 2049, sample length: 3398 +[default0]:Skipping sample id=1055522. Maximum sequence length: 2049, sample length: 2133 +[default0]:Skipping sample id=1463673. Maximum sequence length: 2049, sample length: 2378 +[default0]:Skipping sample id=792300. Maximum sequence length: 2049, sample length: 2737 +[default0]:Skipping sample id=498041. Maximum sequence length: 2049, sample length: 2220 +[default0]:Skipping sample id=1230561. Maximum sequence length: 2049, sample length: 4724 +[default0]:Skipping sample id=1381659. Maximum sequence length: 2049, sample length: 2409 +[default0]:Skipping sample id=393286. Maximum sequence length: 2049, sample length: 2076 +[default0]:Skipping sample id=642975. Maximum sequence length: 2049, sample length: 2429 +[default0]:Skipping sample id=141513. Maximum sequence length: 2049, sample length: 2582 +[default0]:Skipping sample id=341097. Maximum sequence length: 2049, sample length: 2701 +[default0]:Skipping sample id=765428. Maximum sequence length: 2049, sample length: 2916 +[default0]:Skipping sample id=1422316. Maximum sequence length: 2049, sample length: 2221 +[default0]:Skipping sample id=1089257. Maximum sequence length: 2049, sample length: 3709 +[default0]:Skipping sample id=188878. Maximum sequence length: 2049, sample length: 2081 +[default0]:Skipping sample id=1167420. Maximum sequence length: 2049, sample length: 2291 +[default0]:Skipping sample id=1487675. Maximum sequence length: 2049, sample length: 2071 +[default0]:Skipping sample id=901693. Maximum sequence length: 2049, sample length: 2085 +[default0]:Skipping sample id=207544. Maximum sequence length: 2049, sample length: 2180 +[default0]:Skipping sample id=1353988. Maximum sequence length: 2049, sample length: 2258 +[default0]:Skipping sample id=678975. Maximum sequence length: 2049, sample length: 3103 +[default0]:Skipping sample id=394216. Maximum sequence length: 2049, sample length: 3540 +[default0]:Skipping sample id=1189487. Maximum sequence length: 2049, sample length: 2560 +[default0]:Skipping sample id=1546756. Maximum sequence length: 2049, sample length: 3011 +[default0]:Skipping sample id=851747. Maximum sequence length: 2049, sample length: 2223 +[default0]:Skipping sample id=1020691. Maximum sequence length: 2049, sample length: 3061 +[default0]:Skipping sample id=156513. Maximum sequence length: 2049, sample length: 2269 +[default0]:Skipping sample id=949561. Maximum sequence length: 2049, sample length: 3341 +[default0]:Skipping sample id=1172080. Maximum sequence length: 2049, sample length: 2969 +[default0]:Skipping sample id=1052153. Maximum sequence length: 2049, sample length: 2821 +[default0]:Skipping sample id=17366. Maximum sequence length: 2049, sample length: 4488 +[default0]:Skipping sample id=81308. Maximum sequence length: 2049, sample length: 2902 +[default0]:Skipping sample id=1165825. Maximum sequence length: 2049, sample length: 2054 +[default0]:Skipping sample id=338882. Maximum sequence length: 2049, sample length: 2268 +[default0]:Skipping sample id=756816. Maximum sequence length: 2049, sample length: 3734 +[default0]:Skipping sample id=1471162. Maximum sequence length: 2049, sample length: 4320 +[default0]:Skipping sample id=916237. Maximum sequence length: 2049, sample length: 2387 +[default0]:Skipping sample id=953323. Maximum sequence length: 2049, sample length: 2639 +[default0]:Skipping sample id=791459. Maximum sequence length: 2049, sample length: 3438 +[default0]:Skipping sample id=279096. Maximum sequence length: 2049, sample length: 2619 +[default0]:Skipping sample id=395236. Maximum sequence length: 2049, sample length: 2627 +[default0]:Skipping sample id=136504. Maximum sequence length: 2049, sample length: 3620 +[default0]:Skipping sample id=627608. Maximum sequence length: 2049, sample length: 2143 +[default0]:Skipping sample id=1209421. Maximum sequence length: 2049, sample length: 4660 +[default0]:Skipping sample id=1386240. Maximum sequence length: 2049, sample length: 2286 +[default0]:Skipping sample id=1329975. Maximum sequence length: 2049, sample length: 3895 +[default0]:Skipping sample id=613704. Maximum sequence length: 2049, sample length: 3170 +[default0]:Skipping sample id=658291. Maximum sequence length: 2049, sample length: 2568 +[default0]:Skipping sample id=431532. Maximum sequence length: 2049, sample length: 2351 +[default0]:Skipping sample id=696475. Maximum sequence length: 2049, sample length: 6149 +[default0]:Skipping sample id=601327. Maximum sequence length: 2049, sample length: 2374 +[default0]:Skipping sample id=1167464. Maximum sequence length: 2049, sample length: 4752 +[default0]:Skipping sample id=1516677. Maximum sequence length: 2049, sample length: 2383 +[default0]:Skipping sample id=1167736. Maximum sequence length: 2049, sample length: 2154 +[default0]:Skipping sample id=987714. Maximum sequence length: 2049, sample length: 2208 +[default0]:Skipping sample id=158995. Maximum sequence length: 2049, sample length: 3297 +[default0]:Skipping sample id=970433. Maximum sequence length: 2049, sample length: 4078 +[default0]:Skipping sample id=457625. Maximum sequence length: 2049, sample length: 3706 +[default0]:Skipping sample id=1494161. Maximum sequence length: 2049, sample length: 4646 +[default0]:Skipping sample id=518760. Maximum sequence length: 2049, sample length: 3462 +[default0]:Skipping sample id=1194733. Maximum sequence length: 2049, sample length: 2514 +[default0]:Skipping sample id=1006935. Maximum sequence length: 2049, sample length: 2720 +[default0]:Skipping sample id=675157. Maximum sequence length: 2049, sample length: 2986 +[default0]:Skipping sample id=1445141. Maximum sequence length: 2049, sample length: 2398 +[default0]:Skipping sample id=546092. Maximum sequence length: 2049, sample length: 4193 +[default0]:Skipping sample id=788441. Maximum sequence length: 2049, sample length: 2122 +[default0]:Skipping sample id=929272. Maximum sequence length: 2049, sample length: 2717 +[default0]:Skipping sample id=64116. Maximum sequence length: 2049, sample length: 3688 +[default0]:Skipping sample id=883723. Maximum sequence length: 2049, sample length: 2321 +[default0]:Skipping sample id=1125975. Maximum sequence length: 2049, sample length: 2218 +[default0]:Skipping sample id=1076985. Maximum sequence length: 2049, sample length: 2431 +[default0]:Skipping sample id=68333. Maximum sequence length: 2049, sample length: 2278 +[default0]:Skipping sample id=807938. Maximum sequence length: 2049, sample length: 2356 +[default0]:Skipping sample id=1435401. Maximum sequence length: 2049, sample length: 2707 +[default0]:Skipping sample id=639979. Maximum sequence length: 2049, sample length: 2137 +[default0]:Skipping sample id=50524. Maximum sequence length: 2049, sample length: 2241 +[default0]:Skipping sample id=164778. Maximum sequence length: 2049, sample length: 2079 +[default0]:Skipping sample id=1366480. Maximum sequence length: 2049, sample length: 2127 +[default0]:Skipping sample id=1038665. Maximum sequence length: 2049, sample length: 2314 +[default0]:Skipping sample id=1282328. Maximum sequence length: 2049, sample length: 3651 +[default0]:Skipping sample id=417005. Maximum sequence length: 2049, sample length: 2182 +[default0]:Skipping sample id=488649. Maximum sequence length: 2049, sample length: 2456 +[default0]:Skipping sample id=1450266. Maximum sequence length: 2049, sample length: 3020 +[default0]:Skipping sample id=850917. Maximum sequence length: 2049, sample length: 2434 +[default0]:Skipping sample id=970663. Maximum sequence length: 2049, sample length: 3708 +[default0]:Skipping sample id=879186. Maximum sequence length: 2049, sample length: 4307 +[default0]:Skipping sample id=659259. Maximum sequence length: 2049, sample length: 3610 +[default0]:Skipping sample id=176038. Maximum sequence length: 2049, sample length: 3096 +[default0]:Skipping sample id=1239783. Maximum sequence length: 2049, sample length: 3544 +[default0]:Skipping sample id=898196. Maximum sequence length: 2049, sample length: 2629 +[default0]:Skipping sample id=799355. Maximum sequence length: 2049, sample length: 2606 +[default0]:Skipping sample id=754894. Maximum sequence length: 2049, sample length: 2819 +[default0]:Skipping sample id=1294243. Maximum sequence length: 2049, sample length: 3014 +[default0]:Skipping sample id=638188. Maximum sequence length: 2049, sample length: 3657 +[default0]:Skipping sample id=1194107. Maximum sequence length: 2049, sample length: 2638 +[default0]:Skipping sample id=416757. Maximum sequence length: 2049, sample length: 2332 +[default0]:Skipping sample id=946140. Maximum sequence length: 2049, sample length: 4151 +[default0]:Skipping sample id=725751. Maximum sequence length: 2049, sample length: 2054 +[default0]:Skipping sample id=1242096. Maximum sequence length: 2049, sample length: 3351 +[default0]:Skipping sample id=932245. Maximum sequence length: 2049, sample length: 2063 +[default0]:Skipping sample id=542028. Maximum sequence length: 2049, sample length: 2234 +[default0]:Skipping sample id=744908. Maximum sequence length: 2049, sample length: 2766 +[default0]:Skipping sample id=1401347. Maximum sequence length: 2049, sample length: 2454 +[default0]:Skipping sample id=933884. Maximum sequence length: 2049, sample length: 2673 +[default0]:Skipping sample id=41084. Maximum sequence length: 2049, sample length: 3052 +[default0]:Skipping sample id=1417343. Maximum sequence length: 2049, sample length: 5439 +[default0]:Skipping sample id=1456892. Maximum sequence length: 2049, sample length: 2617 +[default0]:Skipping sample id=1560320. Maximum sequence length: 2049, sample length: 2793 +[default0]:Skipping sample id=104468. Maximum sequence length: 2049, sample length: 4641 +[default0]:Skipping sample id=552091. Maximum sequence length: 2049, sample length: 2513 +[default0]:Skipping sample id=399501. Maximum sequence length: 2049, sample length: 2474 +[default0]:Skipping sample id=422139. Maximum sequence length: 2049, sample length: 2165 +[default0]:Skipping sample id=254616. Maximum sequence length: 2049, sample length: 3325 +[default0]:Skipping sample id=315969. Maximum sequence length: 2049, sample length: 2214 +[default0]:Skipping sample id=266070. Maximum sequence length: 2049, sample length: 2101 +[default0]:Skipping sample id=323803. Maximum sequence length: 2049, sample length: 2158 +[default0]:Skipping sample id=368104. Maximum sequence length: 2049, sample length: 2520 +[default0]:Skipping sample id=889736. Maximum sequence length: 2049, sample length: 2214 +[default0]:Skipping sample id=970493. Maximum sequence length: 2049, sample length: 2497 +[default0]:Skipping sample id=580019. Maximum sequence length: 2049, sample length: 2227 +[default0]:Skipping sample id=777908. Maximum sequence length: 2049, sample length: 4084 +[default0]:Skipping sample id=1254063. Maximum sequence length: 2049, sample length: 2241 +[default0]:Skipping sample id=1337384. Maximum sequence length: 2049, sample length: 3593 +[default0]:Skipping sample id=983495. Maximum sequence length: 2049, sample length: 3495 +[default0]:Skipping sample id=460732. Maximum sequence length: 2049, sample length: 3127 +[default0]:Skipping sample id=223169. Maximum sequence length: 2049, sample length: 4713 +[default0]:Skipping sample id=573830. Maximum sequence length: 2049, sample length: 5196 +[default0]:Skipping sample id=1561944. Maximum sequence length: 2049, sample length: 4239 +[default0]:Skipping sample id=848161. Maximum sequence length: 2049, sample length: 2775 +[default0]:Skipping sample id=746266. Maximum sequence length: 2049, sample length: 3749 +[default0]:Skipping sample id=550808. Maximum sequence length: 2049, sample length: 2280 +[default0]:Skipping sample id=135778. Maximum sequence length: 2049, sample length: 2501 +[default0]:Skipping sample id=69864. Maximum sequence length: 2049, sample length: 2552 +[default0]:Skipping sample id=1464953. Maximum sequence length: 2049, sample length: 4507 +[default0]:Skipping sample id=1377195. Maximum sequence length: 2049, sample length: 2870 +[default0]:Skipping sample id=747056. Maximum sequence length: 2049, sample length: 2081 +[default0]:Skipping sample id=302480. Maximum sequence length: 2049, sample length: 2385 +[default0]:Skipping sample id=1322216. Maximum sequence length: 2049, sample length: 2266 +[default0]:Skipping sample id=1495534. Maximum sequence length: 2049, sample length: 2388 +[default0]:Skipping sample id=540598. Maximum sequence length: 2049, sample length: 4606 +[default0]:Skipping sample id=830681. Maximum sequence length: 2049, sample length: 2140 +[default0]:Skipping sample id=8359. Maximum sequence length: 2049, sample length: 2192 +[default0]:Skipping sample id=624558. Maximum sequence length: 2049, sample length: 2103 +[default0]:Skipping sample id=843209. Maximum sequence length: 2049, sample length: 2638 +[default0]:Skipping sample id=905673. Maximum sequence length: 2049, sample length: 3087 +[default0]:Skipping sample id=369886. Maximum sequence length: 2049, sample length: 2604 +[default0]:Skipping sample id=1482051. Maximum sequence length: 2049, sample length: 4763 +[default0]:Skipping sample id=395070. Maximum sequence length: 2049, sample length: 2392 +[default0]:Skipping sample id=1338552. Maximum sequence length: 2049, sample length: 5646 +[default0]:Skipping sample id=1342044. Maximum sequence length: 2049, sample length: 4588 +[default0]:Skipping sample id=162603. Maximum sequence length: 2049, sample length: 2840 +[default0]:Skipping sample id=1423405. Maximum sequence length: 2049, sample length: 2708 +[default0]:Skipping sample id=482094. Maximum sequence length: 2049, sample length: 2072 +[default0]:Skipping sample id=288921. Maximum sequence length: 2049, sample length: 2952 +[default0]:Skipping sample id=806562. Maximum sequence length: 2049, sample length: 4030 +[default0]:Skipping sample id=927264. Maximum sequence length: 2049, sample length: 2362 +[default0]:Skipping sample id=971124. Maximum sequence length: 2049, sample length: 2344 +[default0]:Skipping sample id=415195. Maximum sequence length: 2049, sample length: 2373 +[default0]:Skipping sample id=487812. Maximum sequence length: 2049, sample length: 2449 +[default0]:Skipping sample id=1478503. Maximum sequence length: 2049, sample length: 2381 +[default0]:Skipping sample id=1455812. Maximum sequence length: 2049, sample length: 4204 +[default0]:Skipping sample id=1020088. Maximum sequence length: 2049, sample length: 2843 +[default0]:Skipping sample id=430914. Maximum sequence length: 2049, sample length: 2330 +[default0]:Skipping sample id=1202729. Maximum sequence length: 2049, sample length: 5921 +[default0]:Skipping sample id=1378023. Maximum sequence length: 2049, sample length: 3526 +[default0]:Skipping sample id=521061. Maximum sequence length: 2049, sample length: 2275 +[default0]:Skipping sample id=94994. Maximum sequence length: 2049, sample length: 2989 +[default0]:Skipping sample id=320553. Maximum sequence length: 2049, sample length: 5865 +[default0]:Skipping sample id=379936. Maximum sequence length: 2049, sample length: 2079 +[default0]:Skipping sample id=1463773. Maximum sequence length: 2049, sample length: 3183 +[default0]:Skipping sample id=33301. Maximum sequence length: 2049, sample length: 2655 +[default0]:Skipping sample id=1203566. Maximum sequence length: 2049, sample length: 2889 +[default0]:Skipping sample id=161894. Maximum sequence length: 2049, sample length: 3891 +[default0]:Skipping sample id=800839. Maximum sequence length: 2049, sample length: 2929 +[default0]:Skipping sample id=31237. Maximum sequence length: 2049, sample length: 2441 +[default0]:Skipping sample id=209296. Maximum sequence length: 2049, sample length: 2762 +[default0]:Skipping sample id=25774. Maximum sequence length: 2049, sample length: 3303 +[default0]:Skipping sample id=1182539. Maximum sequence length: 2049, sample length: 4218 +[default0]:Skipping sample id=1359968. Maximum sequence length: 2049, sample length: 2876 +[default0]:Skipping sample id=1073618. Maximum sequence length: 2049, sample length: 3731 +[default0]:Skipping sample id=811693. Maximum sequence length: 2049, sample length: 2592 +[default0]:Skipping sample id=782512. Maximum sequence length: 2049, sample length: 2089 +[default0]:Skipping sample id=703795. Maximum sequence length: 2049, sample length: 2171 +[default0]:Skipping sample id=1417754. Maximum sequence length: 2049, sample length: 2354 +[default0]:Skipping sample id=316756. Maximum sequence length: 2049, sample length: 2419 +[default0]:Skipping sample id=535725. Maximum sequence length: 2049, sample length: 2553 +[default0]:Skipping sample id=275535. Maximum sequence length: 2049, sample length: 6756 +[default0]:Skipping sample id=716779. Maximum sequence length: 2049, sample length: 2752 +[default0]:Skipping sample id=1017816. Maximum sequence length: 2049, sample length: 2788 +[default0]:Skipping sample id=392055. Maximum sequence length: 2049, sample length: 2501 +[default0]:Skipping sample id=970230. Maximum sequence length: 2049, sample length: 2369 +[default0]:Skipping sample id=1392823. Maximum sequence length: 2049, sample length: 2197 +[default0]:Skipping sample id=108870. Maximum sequence length: 2049, sample length: 2642 +[default0]:Skipping sample id=504466. Maximum sequence length: 2049, sample length: 2054 +[default0]:Skipping sample id=202167. Maximum sequence length: 2049, sample length: 2429 +[default0]:Skipping sample id=15154. Maximum sequence length: 2049, sample length: 2270 +[default0]:Skipping sample id=1540453. Maximum sequence length: 2049, sample length: 3591 +[default0]:Skipping sample id=898610. Maximum sequence length: 2049, sample length: 2762 +[default0]:Skipping sample id=920444. Maximum sequence length: 2049, sample length: 2192 +[default0]:Skipping sample id=1561306. Maximum sequence length: 2049, sample length: 2967 +[default0]:Skipping sample id=1024686. Maximum sequence length: 2049, sample length: 2239 +[default0]:Skipping sample id=924516. Maximum sequence length: 2049, sample length: 3521 +[default0]:Skipping sample id=178761. Maximum sequence length: 2049, sample length: 6285 +[default0]:Skipping sample id=1158968. Maximum sequence length: 2049, sample length: 2525 +[default0]:Skipping sample id=173680. Maximum sequence length: 2049, sample length: 2498 +[default0]:Skipping sample id=700253. Maximum sequence length: 2049, sample length: 2939 +[default0]:Skipping sample id=660028. Maximum sequence length: 2049, sample length: 3352 +[default0]:Skipping sample id=605980. Maximum sequence length: 2049, sample length: 4025 +[default0]:Skipping sample id=862429. Maximum sequence length: 2049, sample length: 2996 +[default0]:Skipping sample id=1482014. Maximum sequence length: 2049, sample length: 3395 +[default0]:Skipping sample id=1108491. Maximum sequence length: 2049, sample length: 2363 +[default0]:Skipping sample id=1517702. Maximum sequence length: 2049, sample length: 2325 +[default0]:Skipping sample id=596218. Maximum sequence length: 2049, sample length: 2432 +[default0]:Skipping sample id=867702. Maximum sequence length: 2049, sample length: 3112 +[default0]:Skipping sample id=63038. Maximum sequence length: 2049, sample length: 2385 +[default0]:Skipping sample id=346173. Maximum sequence length: 2049, sample length: 2727 +[default0]:Skipping sample id=1538927. Maximum sequence length: 2049, sample length: 2266 +[default0]:Skipping sample id=773050. Maximum sequence length: 2049, sample length: 3372 +[default0]:Skipping sample id=899219. Maximum sequence length: 2049, sample length: 2125 +[default0]:Skipping sample id=470750. Maximum sequence length: 2049, sample length: 2159 +[default0]:Skipping sample id=1402124. Maximum sequence length: 2049, sample length: 3490 +[default0]:Skipping sample id=1556275. Maximum sequence length: 2049, sample length: 3143 +[default0]:Skipping sample id=101602. Maximum sequence length: 2049, sample length: 2903 +[default0]:Skipping sample id=396093. Maximum sequence length: 2049, sample length: 2644 +[default0]:Skipping sample id=1471082. Maximum sequence length: 2049, sample length: 2118 +[default0]:Skipping sample id=387326. Maximum sequence length: 2049, sample length: 3015 +[default0]:Skipping sample id=1422047. Maximum sequence length: 2049, sample length: 2574 +[default0]:Skipping sample id=192260. Maximum sequence length: 2049, sample length: 5130 +[default0]:Skipping sample id=781833. Maximum sequence length: 2049, sample length: 2710 +[default0]:Skipping sample id=1391068. Maximum sequence length: 2049, sample length: 2180 +[default0]:Skipping sample id=235173. Maximum sequence length: 2049, sample length: 3423 +[default0]:Skipping sample id=881192. Maximum sequence length: 2049, sample length: 2742 +[default0]:Skipping sample id=948969. Maximum sequence length: 2049, sample length: 2220 +[default0]:Skipping sample id=458907. Maximum sequence length: 2049, sample length: 2356 +[default0]:Skipping sample id=1176825. Maximum sequence length: 2049, sample length: 2756 +[default0]:Skipping sample id=1156683. Maximum sequence length: 2049, sample length: 2752 +[default0]:Skipping sample id=276748. Maximum sequence length: 2049, sample length: 2236 +[default0]:Skipping sample id=496764. Maximum sequence length: 2049, sample length: 3030 +[default0]:Skipping sample id=1088502. Maximum sequence length: 2049, sample length: 3224 +[default0]:Skipping sample id=1029838. Maximum sequence length: 2049, sample length: 4583 +[default0]:Skipping sample id=1346338. Maximum sequence length: 2049, sample length: 2621 +[default0]:Skipping sample id=1399099. Maximum sequence length: 2049, sample length: 2481 +[default0]:Skipping sample id=964875. Maximum sequence length: 2049, sample length: 2753 +[default0]:Skipping sample id=1261698. Maximum sequence length: 2049, sample length: 4250 +[default0]:Skipping sample id=1190585. Maximum sequence length: 2049, sample length: 3405 +[default0]:Skipping sample id=426818. Maximum sequence length: 2049, sample length: 2095 +[default0]:Skipping sample id=593743. Maximum sequence length: 2049, sample length: 2174 +[default0]:Skipping sample id=409445. Maximum sequence length: 2049, sample length: 3069 +[default0]:Skipping sample id=1063557. Maximum sequence length: 2049, sample length: 2298 +[default0]:Skipping sample id=1460075. Maximum sequence length: 2049, sample length: 2096 +[default0]:Skipping sample id=154771. Maximum sequence length: 2049, sample length: 2064 +[default0]:Skipping sample id=652938. Maximum sequence length: 2049, sample length: 3332 +[default0]:Skipping sample id=299926. Maximum sequence length: 2049, sample length: 2199 +[default0]:Skipping sample id=1330801. Maximum sequence length: 2049, sample length: 2483 +[default0]:Skipping sample id=587520. Maximum sequence length: 2049, sample length: 2574 +[default0]:Skipping sample id=717549. Maximum sequence length: 2049, sample length: 2087 +[default0]:Skipping sample id=391283. Maximum sequence length: 2049, sample length: 2616 +[default0]:Skipping sample id=1141761. Maximum sequence length: 2049, sample length: 2325 +[default0]:Skipping sample id=941238. Maximum sequence length: 2049, sample length: 5289 +[default0]:Skipping sample id=1546103. Maximum sequence length: 2049, sample length: 3069 +[default0]:Skipping sample id=1480422. Maximum sequence length: 2049, sample length: 2688 +[default0]:Skipping sample id=1454469. Maximum sequence length: 2049, sample length: 3149 +[default0]:Skipping sample id=169082. Maximum sequence length: 2049, sample length: 2143 +[default0]:Skipping sample id=1004510. Maximum sequence length: 2049, sample length: 2385 +[default0]:Skipping sample id=1545. Maximum sequence length: 2049, sample length: 2632 +[default0]:Skipping sample id=718069. Maximum sequence length: 2049, sample length: 2361 +[default0]:Skipping sample id=510799. Maximum sequence length: 2049, sample length: 8201 +[default0]:Skipping sample id=581365. Maximum sequence length: 2049, sample length: 2518 +[default0]:Skipping sample id=597587. Maximum sequence length: 2049, sample length: 2241 +[default0]:Skipping sample id=845137. Maximum sequence length: 2049, sample length: 3094 +[default0]:Skipping sample id=1180544. Maximum sequence length: 2049, sample length: 2234 +[default0]:Skipping sample id=1534642. Maximum sequence length: 2049, sample length: 3105 +[default0]:Skipping sample id=1124822. Maximum sequence length: 2049, sample length: 2133 +[default0]:Skipping sample id=889601. Maximum sequence length: 2049, sample length: 4163 +[default0]:Skipping sample id=1059759. Maximum sequence length: 2049, sample length: 2250 +[default0]:Skipping sample id=993729. Maximum sequence length: 2049, sample length: 2165 +[default0]:Skipping sample id=1469163. Maximum sequence length: 2049, sample length: 2758 +[default0]:Skipping sample id=1292711. Maximum sequence length: 2049, sample length: 4145 +[default0]:Skipping sample id=1348077. Maximum sequence length: 2049, sample length: 2432 +[default0]:Skipping sample id=1314971. Maximum sequence length: 2049, sample length: 2549 +[default0]:Skipping sample id=71198. Maximum sequence length: 2049, sample length: 5304 +[default0]:Skipping sample id=1426692. Maximum sequence length: 2049, sample length: 2829 +[default0]:Skipping sample id=64104. Maximum sequence length: 2049, sample length: 2713 +[default0]:Skipping sample id=1078801. Maximum sequence length: 2049, sample length: 2663 +[default0]:Skipping sample id=1438048. Maximum sequence length: 2049, sample length: 2296 +[default0]:Skipping sample id=885515. Maximum sequence length: 2049, sample length: 2528 +[default0]:Skipping sample id=1245344. Maximum sequence length: 2049, sample length: 2159 +[default0]:Skipping sample id=533887. Maximum sequence length: 2049, sample length: 2134 +[default0]:Skipping sample id=802688. Maximum sequence length: 2049, sample length: 2647 +[default0]:Skipping sample id=705789. Maximum sequence length: 2049, sample length: 3972 +[default0]:Skipping sample id=1034157. Maximum sequence length: 2049, sample length: 2206 +[default0]:Skipping sample id=225083. Maximum sequence length: 2049, sample length: 2695 +[default0]:Skipping sample id=268293. Maximum sequence length: 2049, sample length: 4465 +[default0]:Skipping sample id=227060. Maximum sequence length: 2049, sample length: 2718 +[default0]:Skipping sample id=742591. Maximum sequence length: 2049, sample length: 4653 +[default0]:Skipping sample id=470535. Maximum sequence length: 2049, sample length: 2226 +[default0]:Skipping sample id=68942. Maximum sequence length: 2049, sample length: 2201 +[default0]:Skipping sample id=340832. Maximum sequence length: 2049, sample length: 2393 +[default0]:Skipping sample id=1028460. Maximum sequence length: 2049, sample length: 3101 +[default0]:Skipping sample id=1131314. Maximum sequence length: 2049, sample length: 2627 +[default0]:Skipping sample id=30652. Maximum sequence length: 2049, sample length: 4174 +[default0]:Skipping sample id=228444. Maximum sequence length: 2049, sample length: 2816 +[default0]:Skipping sample id=20048. Maximum sequence length: 2049, sample length: 3016 +[default0]:Skipping sample id=848969. Maximum sequence length: 2049, sample length: 4084 +[default0]:Skipping sample id=545205. Maximum sequence length: 2049, sample length: 2065 +[default0]:Skipping sample id=749931. Maximum sequence length: 2049, sample length: 3229 +[default0]:Skipping sample id=620870. Maximum sequence length: 2049, sample length: 2248 +[default0]:Skipping sample id=1053136. Maximum sequence length: 2049, sample length: 2699 +[default0]:Skipping sample id=647552. Maximum sequence length: 2049, sample length: 3414 +[default0]:Skipping sample id=330563. Maximum sequence length: 2049, sample length: 2174 +[default0]:Skipping sample id=975489. Maximum sequence length: 2049, sample length: 2621 +[default0]:Skipping sample id=1349814. Maximum sequence length: 2049, sample length: 3656 +[default0]:Skipping sample id=1287652. Maximum sequence length: 2049, sample length: 3202 +[default0]:Skipping sample id=1547915. Maximum sequence length: 2049, sample length: 3252 +[default0]:Skipping sample id=23100. Maximum sequence length: 2049, sample length: 2073 +[default0]:Skipping sample id=1032520. Maximum sequence length: 2049, sample length: 4230 +[default0]:Skipping sample id=1129593. Maximum sequence length: 2049, sample length: 2080 +[default0]:Skipping sample id=253428. Maximum sequence length: 2049, sample length: 3209 +[default0]:Skipping sample id=1053565. Maximum sequence length: 2049, sample length: 3976 +[default0]:Skipping sample id=976290. Maximum sequence length: 2049, sample length: 4983 +[default0]:Skipping sample id=184805. Maximum sequence length: 2049, sample length: 2534 +[default0]:Skipping sample id=1561925. Maximum sequence length: 2049, sample length: 2763 +[default0]:Skipping sample id=1171407. Maximum sequence length: 2049, sample length: 2232 +[default0]:Skipping sample id=1340104. Maximum sequence length: 2049, sample length: 2978 +[default0]:Skipping sample id=933530. Maximum sequence length: 2049, sample length: 2389 +[default0]:Skipping sample id=681616. Maximum sequence length: 2049, sample length: 4967 +[default0]:Skipping sample id=1080494. Maximum sequence length: 2049, sample length: 2703 +[default0]:Skipping sample id=1205545. Maximum sequence length: 2049, sample length: 4316 +[default0]:Skipping sample id=179347. Maximum sequence length: 2049, sample length: 2522 +[default0]:Skipping sample id=1320152. Maximum sequence length: 2049, sample length: 2566 +[default0]:Skipping sample id=214772. Maximum sequence length: 2049, sample length: 2559 +[default0]:Skipping sample id=818508. Maximum sequence length: 2049, sample length: 3322 +[default0]:Skipping sample id=1360457. Maximum sequence length: 2049, sample length: 4162 +[default0]:Skipping sample id=11410. Maximum sequence length: 2049, sample length: 2486 +[default0]:Skipping sample id=857633. Maximum sequence length: 2049, sample length: 2891 +[default0]:Skipping sample id=1409477. Maximum sequence length: 2049, sample length: 7252 +[default0]:Skipping sample id=1359715. Maximum sequence length: 2049, sample length: 2906 +[default0]:Skipping sample id=729602. Maximum sequence length: 2049, sample length: 5937 +[default0]:Skipping sample id=988212. Maximum sequence length: 2049, sample length: 3451 +[default0]:Skipping sample id=387314. Maximum sequence length: 2049, sample length: 3166 +[default0]:Skipping sample id=55742. Maximum sequence length: 2049, sample length: 3491 +[default0]:Skipping sample id=1276699. Maximum sequence length: 2049, sample length: 4609 +[default0]:Skipping sample id=1259392. Maximum sequence length: 2049, sample length: 2215 +[default0]:Skipping sample id=934418. Maximum sequence length: 2049, sample length: 2327 +[default0]:Skipping sample id=59307. Maximum sequence length: 2049, sample length: 2615 +[default0]:Skipping sample id=662568. Maximum sequence length: 2049, sample length: 4690 +[default0]:Skipping sample id=1498957. Maximum sequence length: 2049, sample length: 2202 +[default0]:Skipping sample id=1523257. Maximum sequence length: 2049, sample length: 2182 +[default0]:Skipping sample id=906848. Maximum sequence length: 2049, sample length: 2506 +[default0]:Skipping sample id=179671. Maximum sequence length: 2049, sample length: 3127 +[default0]:Skipping sample id=530540. Maximum sequence length: 2049, sample length: 4292 +[default0]:Skipping sample id=1422853. Maximum sequence length: 2049, sample length: 4021 +[default0]:Skipping sample id=537988. Maximum sequence length: 2049, sample length: 3489 +[default0]:Skipping sample id=1313573. Maximum sequence length: 2049, sample length: 2666 +[default0]:Skipping sample id=312619. Maximum sequence length: 2049, sample length: 3656 +[default0]:Skipping sample id=1253972. Maximum sequence length: 2049, sample length: 2428 +[default0]:Skipping sample id=1417031. Maximum sequence length: 2049, sample length: 2235 +[default0]:Skipping sample id=1218912. Maximum sequence length: 2049, sample length: 2054 +[default0]:Skipping sample id=1157338. Maximum sequence length: 2049, sample length: 2251 +[default0]:Skipping sample id=626235. Maximum sequence length: 2049, sample length: 2118 +[default0]:Skipping sample id=1054029. Maximum sequence length: 2049, sample length: 3138 +[default0]:Skipping sample id=780098. Maximum sequence length: 2049, sample length: 2504 +[default0]:Skipping sample id=1497162. Maximum sequence length: 2049, sample length: 2193 +[default0]:Skipping sample id=1107133. Maximum sequence length: 2049, sample length: 2722 +[default0]:Skipping sample id=654002. Maximum sequence length: 2049, sample length: 2871 +[default0]:Skipping sample id=863193. Maximum sequence length: 2049, sample length: 2785 +[default0]:Skipping sample id=276027. Maximum sequence length: 2049, sample length: 2880 +[default0]:Skipping sample id=1233350. Maximum sequence length: 2049, sample length: 2191 +[default0]:Skipping sample id=210824. Maximum sequence length: 2049, sample length: 2604 +[default0]:Skipping sample id=879851. Maximum sequence length: 2049, sample length: 4150 +[default0]:Skipping sample id=600189. Maximum sequence length: 2049, sample length: 3366 +[default0]:Skipping sample id=1501769. Maximum sequence length: 2049, sample length: 3309 +[default0]:Skipping sample id=56895. Maximum sequence length: 2049, sample length: 2554 +[default0]:Skipping sample id=567448. Maximum sequence length: 2049, sample length: 2813 +[default0]:Skipping sample id=1104336. Maximum sequence length: 2049, sample length: 2740 +[default0]:Skipping sample id=374559. Maximum sequence length: 2049, sample length: 3492 +[default0]:Skipping sample id=1393114. Maximum sequence length: 2049, sample length: 3045 +[default0]:Skipping sample id=636797. Maximum sequence length: 2049, sample length: 2179 +[default0]:Skipping sample id=620986. Maximum sequence length: 2049, sample length: 3366 +[default0]:Skipping sample id=675276. Maximum sequence length: 2049, sample length: 2896 +[default0]:Skipping sample id=1455942. Maximum sequence length: 2049, sample length: 4323 +[default0]:Skipping sample id=104913. Maximum sequence length: 2049, sample length: 3645 +[default0]:Skipping sample id=649830. Maximum sequence length: 2049, sample length: 2468 +[default0]:Skipping sample id=852697. Maximum sequence length: 2049, sample length: 2362 +[default0]:Skipping sample id=495537. Maximum sequence length: 2049, sample length: 2253 +[default0]:Skipping sample id=1341785. Maximum sequence length: 2049, sample length: 2176 +[default0]:Skipping sample id=1518223. Maximum sequence length: 2049, sample length: 6066 +[default0]:Skipping sample id=947798. Maximum sequence length: 2049, sample length: 2257 +[default0]:Skipping sample id=320320. Maximum sequence length: 2049, sample length: 2395 +[default0]:Skipping sample id=802755. Maximum sequence length: 2049, sample length: 3246 +[default0]:Skipping sample id=764706. Maximum sequence length: 2049, sample length: 3828 +[default0]:Skipping sample id=389085. Maximum sequence length: 2049, sample length: 3804 +[default0]:Skipping sample id=1046328. Maximum sequence length: 2049, sample length: 3728 +[default0]:Skipping sample id=459092. Maximum sequence length: 2049, sample length: 2824 +[default0]:Skipping sample id=135523. Maximum sequence length: 2049, sample length: 2822 +[default0]:Skipping sample id=451725. Maximum sequence length: 2049, sample length: 2378 +[default0]:Skipping sample id=947294. Maximum sequence length: 2049, sample length: 5104 +[default0]:Skipping sample id=728576. Maximum sequence length: 2049, sample length: 3363 +[default0]:Skipping sample id=192018. Maximum sequence length: 2049, sample length: 3994 +[default0]:Skipping sample id=1088596. Maximum sequence length: 2049, sample length: 3099 +[default0]:Skipping sample id=1480756. Maximum sequence length: 2049, sample length: 4848 +[default0]:Skipping sample id=784237. Maximum sequence length: 2049, sample length: 2381 +[default0]:Skipping sample id=929817. Maximum sequence length: 2049, sample length: 2855 +[default0]:Skipping sample id=467884. Maximum sequence length: 2049, sample length: 2993 +[default0]:Skipping sample id=414800. Maximum sequence length: 2049, sample length: 2674 +[default0]:Skipping sample id=1444975. Maximum sequence length: 2049, sample length: 2571 +[default0]:Skipping sample id=1545143. Maximum sequence length: 2049, sample length: 2101 +[default0]:Skipping sample id=1455735. Maximum sequence length: 2049, sample length: 3131 +[default0]:Skipping sample id=544007. Maximum sequence length: 2049, sample length: 2107 +[default0]:Skipping sample id=1006973. Maximum sequence length: 2049, sample length: 2183 +[default0]:Skipping sample id=53433. Maximum sequence length: 2049, sample length: 4501 +[default0]:Skipping sample id=552060. Maximum sequence length: 2049, sample length: 2112 +[default0]:Skipping sample id=1106887. Maximum sequence length: 2049, sample length: 2896 +[default0]:Skipping sample id=320460. Maximum sequence length: 2049, sample length: 2166 +[default0]:Skipping sample id=512057. Maximum sequence length: 2049, sample length: 2957 +[default0]:Skipping sample id=635050. Maximum sequence length: 2049, sample length: 3510 +[default0]:Skipping sample id=270164. Maximum sequence length: 2049, sample length: 3489 +[default0]:Skipping sample id=1404935. Maximum sequence length: 2049, sample length: 2562 +[default0]:Skipping sample id=357288. Maximum sequence length: 2049, sample length: 5075 +[default0]:Skipping sample id=706199. Maximum sequence length: 2049, sample length: 4624 +[default0]:Skipping sample id=1398250. Maximum sequence length: 2049, sample length: 2322 +[default0]:Skipping sample id=920728. Maximum sequence length: 2049, sample length: 3102 +[default0]:Skipping sample id=864306. Maximum sequence length: 2049, sample length: 7844 +[default0]:Skipping sample id=1364051. Maximum sequence length: 2049, sample length: 2968 +[default0]:Skipping sample id=162012. Maximum sequence length: 2049, sample length: 2377 +[default0]:Skipping sample id=889468. Maximum sequence length: 2049, sample length: 2769 +[default0]:Skipping sample id=8544. Maximum sequence length: 2049, sample length: 2806 +[default0]:Skipping sample id=1078312. Maximum sequence length: 2049, sample length: 2171 +[default0]:Skipping sample id=1018165. Maximum sequence length: 2049, sample length: 2500 +[default0]:Skipping sample id=1532954. Maximum sequence length: 2049, sample length: 2145 +[default0]:Skipping sample id=411212. Maximum sequence length: 2049, sample length: 2053 +[default0]:Skipping sample id=484106. Maximum sequence length: 2049, sample length: 2244 +[default0]:Skipping sample id=1073082. Maximum sequence length: 2049, sample length: 3499 +[default0]:Skipping sample id=1035961. Maximum sequence length: 2049, sample length: 4967 +[default0]:Skipping sample id=478609. Maximum sequence length: 2049, sample length: 2275 +[default0]:Skipping sample id=831891. Maximum sequence length: 2049, sample length: 3397 +[default0]:Skipping sample id=127907. Maximum sequence length: 2049, sample length: 2172 +[default0]:Skipping sample id=700902. Maximum sequence length: 2049, sample length: 2113 +[default0]:Skipping sample id=743276. Maximum sequence length: 2049, sample length: 2517 +[default0]:Skipping sample id=74458. Maximum sequence length: 2049, sample length: 2282 +[default0]:Skipping sample id=521954. Maximum sequence length: 2049, sample length: 3604 +[default0]:Skipping sample id=294547. Maximum sequence length: 2049, sample length: 4461 +[default0]:Skipping sample id=66044. Maximum sequence length: 2049, sample length: 2493 +[default0]:Skipping sample id=1400587. Maximum sequence length: 2049, sample length: 5646 +[default0]:Skipping sample id=12921. Maximum sequence length: 2049, sample length: 2328 +[default0]:Skipping sample id=1160326. Maximum sequence length: 2049, sample length: 2733 +[default0]:Skipping sample id=649791. Maximum sequence length: 2049, sample length: 8012 +[default0]:Skipping sample id=613602. Maximum sequence length: 2049, sample length: 3482 +[default0]:Skipping sample id=396405. Maximum sequence length: 2049, sample length: 3440 +[default0]:Skipping sample id=711117. Maximum sequence length: 2049, sample length: 3405 +[default0]:Skipping sample id=589886. Maximum sequence length: 2049, sample length: 3302 +[default0]:Skipping sample id=1174420. Maximum sequence length: 2049, sample length: 2319 +[default0]:Skipping sample id=601189. Maximum sequence length: 2049, sample length: 3461 +[default0]:Skipping sample id=628386. Maximum sequence length: 2049, sample length: 2158 +[default0]:Skipping sample id=444177. Maximum sequence length: 2049, sample length: 6239 +[default0]:Skipping sample id=302732. Maximum sequence length: 2049, sample length: 3320 +[default0]:Skipping sample id=509115. Maximum sequence length: 2049, sample length: 2628 +[default0]:Skipping sample id=706931. Maximum sequence length: 2049, sample length: 2310 +[default0]:Skipping sample id=1521225. Maximum sequence length: 2049, sample length: 6402 +[default0]:Skipping sample id=635236. Maximum sequence length: 2049, sample length: 2268 +[default0]:Skipping sample id=70142. Maximum sequence length: 2049, sample length: 2763 +[default0]:Skipping sample id=303073. Maximum sequence length: 2049, sample length: 2287 +[default0]:Skipping sample id=114963. Maximum sequence length: 2049, sample length: 2227 +[default0]:Skipping sample id=120971. Maximum sequence length: 2049, sample length: 2468 +[default0]:Skipping sample id=998071. Maximum sequence length: 2049, sample length: 2237 +[default0]:Skipping sample id=985685. Maximum sequence length: 2049, sample length: 2414 +[default0]:Skipping sample id=105453. Maximum sequence length: 2049, sample length: 2560 +[default0]:Skipping sample id=713093. Maximum sequence length: 2049, sample length: 2688 +[default0]:Skipping sample id=50421. Maximum sequence length: 2049, sample length: 2104 +[default0]:Skipping sample id=643201. Maximum sequence length: 2049, sample length: 2782 +[default0]:Skipping sample id=734415. Maximum sequence length: 2049, sample length: 3860 +[default0]:Skipping sample id=1558299. Maximum sequence length: 2049, sample length: 3774 +[default0]:Skipping sample id=297354. Maximum sequence length: 2049, sample length: 2616 +[default0]:Skipping sample id=1186162. Maximum sequence length: 2049, sample length: 3540 +[default0]:Skipping sample id=1261767. Maximum sequence length: 2049, sample length: 3191 +[default0]:Skipping sample id=227197. Maximum sequence length: 2049, sample length: 4014 +[default0]:Skipping sample id=1268321. Maximum sequence length: 2049, sample length: 2925 +[default0]:Skipping sample id=220488. Maximum sequence length: 2049, sample length: 2335 +[default0]:Skipping sample id=16940. Maximum sequence length: 2049, sample length: 2142 +[default0]:Skipping sample id=51689. Maximum sequence length: 2049, sample length: 2718 +[default0]:Skipping sample id=13994. Maximum sequence length: 2049, sample length: 2513 +[default0]:Skipping sample id=1057862. Maximum sequence length: 2049, sample length: 2529 +[default0]:Skipping sample id=195363. Maximum sequence length: 2049, sample length: 2340 +[default0]:Skipping sample id=132885. Maximum sequence length: 2049, sample length: 3200 +[default0]:Skipping sample id=1288770. Maximum sequence length: 2049, sample length: 3531 +[default0]:Skipping sample id=363332. Maximum sequence length: 2049, sample length: 2249 +[default0]:Skipping sample id=964388. Maximum sequence length: 2049, sample length: 5223 +[default0]:Skipping sample id=1282757. Maximum sequence length: 2049, sample length: 3122 +[default0]:Skipping sample id=942979. Maximum sequence length: 2049, sample length: 2132 +[default0]:Skipping sample id=1046247. Maximum sequence length: 2049, sample length: 3886 +[default0]:Skipping sample id=74619. Maximum sequence length: 2049, sample length: 2137 +[default0]:Skipping sample id=1155371. Maximum sequence length: 2049, sample length: 2403 +[default0]:Skipping sample id=948700. Maximum sequence length: 2049, sample length: 2050 +[default0]:Skipping sample id=325301. Maximum sequence length: 2049, sample length: 5006 +[default0]:Skipping sample id=464393. Maximum sequence length: 2049, sample length: 2233 +[default0]:Skipping sample id=1090621. Maximum sequence length: 2049, sample length: 2983 +[default0]:Skipping sample id=1061984. Maximum sequence length: 2049, sample length: 3478 +[default0]:Skipping sample id=871057. Maximum sequence length: 2049, sample length: 2974 +[default0]:Skipping sample id=492242. Maximum sequence length: 2049, sample length: 6083 +[default0]:Skipping sample id=1364180. Maximum sequence length: 2049, sample length: 4142 +[default0]:Skipping sample id=252985. Maximum sequence length: 2049, sample length: 3926 +[default0]:Skipping sample id=54812. Maximum sequence length: 2049, sample length: 2083 +[default0]:Skipping sample id=1168539. Maximum sequence length: 2049, sample length: 2417 +[default0]:Skipping sample id=366791. Maximum sequence length: 2049, sample length: 2283 +[default0]:Skipping sample id=759414. Maximum sequence length: 2049, sample length: 2457 +[default0]:Skipping sample id=616626. Maximum sequence length: 2049, sample length: 2276 +[default0]:Skipping sample id=1250828. Maximum sequence length: 2049, sample length: 3019 +[default0]:Skipping sample id=693218. Maximum sequence length: 2049, sample length: 2259 +[default0]:Skipping sample id=428699. Maximum sequence length: 2049, sample length: 3523 +[default0]:Skipping sample id=406412. Maximum sequence length: 2049, sample length: 3333 +[default0]:Skipping sample id=428762. Maximum sequence length: 2049, sample length: 2199 +[default0]:Skipping sample id=757263. Maximum sequence length: 2049, sample length: 2192 +[default0]:Skipping sample id=975425. Maximum sequence length: 2049, sample length: 2815 +[default0]:Skipping sample id=583620. Maximum sequence length: 2049, sample length: 5508 +[default0]:Skipping sample id=821578. Maximum sequence length: 2049, sample length: 4307 +[default0]:Skipping sample id=792838. Maximum sequence length: 2049, sample length: 2392 +[default0]:Skipping sample id=479476. Maximum sequence length: 2049, sample length: 2082 +[default0]:Skipping sample id=464399. Maximum sequence length: 2049, sample length: 2515 +[default0]:Skipping sample id=851095. Maximum sequence length: 2049, sample length: 2183 +[default0]:Skipping sample id=1354032. Maximum sequence length: 2049, sample length: 2157 +[default0]:Skipping sample id=286450. Maximum sequence length: 2049, sample length: 2220 +[default0]:Skipping sample id=1494484. Maximum sequence length: 2049, sample length: 2161 +[default0]:Skipping sample id=1135612. Maximum sequence length: 2049, sample length: 2801 +[default0]:Skipping sample id=626533. Maximum sequence length: 2049, sample length: 3603 +[default0]:Skipping sample id=234161. Maximum sequence length: 2049, sample length: 2352 +[default0]:Skipping sample id=558630. Maximum sequence length: 2049, sample length: 3414 +[default0]:Skipping sample id=65462. Maximum sequence length: 2049, sample length: 6301 +[default0]:Skipping sample id=1349347. Maximum sequence length: 2049, sample length: 3257 +[default0]:Skipping sample id=1247555. Maximum sequence length: 2049, sample length: 5675 +[default0]:Skipping sample id=1140745. Maximum sequence length: 2049, sample length: 2648 +[default0]:Skipping sample id=1228850. Maximum sequence length: 2049, sample length: 4272 +[default0]:Skipping sample id=1342533. Maximum sequence length: 2049, sample length: 2262 +[default0]:Skipping sample id=1418321. Maximum sequence length: 2049, sample length: 3559 +[default0]:Skipping sample id=1309473. Maximum sequence length: 2049, sample length: 3372 +[default0]:Skipping sample id=858011. Maximum sequence length: 2049, sample length: 2134 +[default0]:Skipping sample id=1086051. Maximum sequence length: 2049, sample length: 2939 +[default0]:Skipping sample id=982346. Maximum sequence length: 2049, sample length: 2450 +[default0]:Skipping sample id=714550. Maximum sequence length: 2049, sample length: 2180 +[default0]:Skipping sample id=216711. Maximum sequence length: 2049, sample length: 2427 +[default0]:Skipping sample id=601817. Maximum sequence length: 2049, sample length: 2062 +[default0]:Skipping sample id=1067220. Maximum sequence length: 2049, sample length: 3669 +[default0]:Skipping sample id=850084. Maximum sequence length: 2049, sample length: 3252 +[default0]:Skipping sample id=1493660. Maximum sequence length: 2049, sample length: 3271 +[default0]:Skipping sample id=172743. Maximum sequence length: 2049, sample length: 2973 +[default0]:Skipping sample id=149107. Maximum sequence length: 2049, sample length: 2801 +[default0]:Skipping sample id=590865. Maximum sequence length: 2049, sample length: 5614 +[default0]:Skipping sample id=383390. Maximum sequence length: 2049, sample length: 2890 +[default0]:Skipping sample id=249422. Maximum sequence length: 2049, sample length: 2188 +[default0]:Skipping sample id=1086703. Maximum sequence length: 2049, sample length: 2801 +[default0]:Skipping sample id=570845. Maximum sequence length: 2049, sample length: 2210 +[default0]:Skipping sample id=675524. Maximum sequence length: 2049, sample length: 2529 +[default0]:Skipping sample id=1267236. Maximum sequence length: 2049, sample length: 2854 +[default0]:Skipping sample id=349087. Maximum sequence length: 2049, sample length: 2238 +[default0]:Skipping sample id=811303. Maximum sequence length: 2049, sample length: 2583 +[default0]:Skipping sample id=1058314. Maximum sequence length: 2049, sample length: 3310 +[default0]:Skipping sample id=77205. Maximum sequence length: 2049, sample length: 2833 +[default0]:Skipping sample id=1265637. Maximum sequence length: 2049, sample length: 2773 +[default0]:Skipping sample id=110172. Maximum sequence length: 2049, sample length: 3503 +[default0]:Skipping sample id=1503237. Maximum sequence length: 2049, sample length: 2101 +[default0]:Skipping sample id=1477823. Maximum sequence length: 2049, sample length: 4837 +[default0]:Skipping sample id=528845. Maximum sequence length: 2049, sample length: 3229 +[default0]:Skipping sample id=701302. Maximum sequence length: 2049, sample length: 3640 +[default0]:Skipping sample id=125042. Maximum sequence length: 2049, sample length: 2270 +[default0]:Skipping sample id=1025823. Maximum sequence length: 2049, sample length: 3880 +[default0]:Skipping sample id=1159387. Maximum sequence length: 2049, sample length: 2372 +[default0]:Skipping sample id=361216. Maximum sequence length: 2049, sample length: 2372 +[default0]:Skipping sample id=141925. Maximum sequence length: 2049, sample length: 2279 +[default0]:Skipping sample id=340133. Maximum sequence length: 2049, sample length: 2458 +[default0]:Skipping sample id=202661. Maximum sequence length: 2049, sample length: 2772 +[default0]:Skipping sample id=249329. Maximum sequence length: 2049, sample length: 2886 +[default0]:Skipping sample id=73564. Maximum sequence length: 2049, sample length: 3937 +[default0]:Skipping sample id=701694. Maximum sequence length: 2049, sample length: 2055 +[default0]:Skipping sample id=415726. Maximum sequence length: 2049, sample length: 2108 +[default0]:Skipping sample id=362356. Maximum sequence length: 2049, sample length: 2076 +[default0]:Skipping sample id=900560. Maximum sequence length: 2049, sample length: 2322 +[default0]:Skipping sample id=533252. Maximum sequence length: 2049, sample length: 2339 +[default0]:Skipping sample id=1238280. Maximum sequence length: 2049, sample length: 2261 +[default0]:Skipping sample id=1342098. Maximum sequence length: 2049, sample length: 8044 +[default0]:Skipping sample id=1360475. Maximum sequence length: 2049, sample length: 2435 +[default0]:Skipping sample id=1067939. Maximum sequence length: 2049, sample length: 4415 +[default0]:Skipping sample id=763301. Maximum sequence length: 2049, sample length: 3430 +[default0]:Skipping sample id=556850. Maximum sequence length: 2049, sample length: 3208 +[default0]:Skipping sample id=1074240. Maximum sequence length: 2049, sample length: 4144 +[default0]:Skipping sample id=404158. Maximum sequence length: 2049, sample length: 2128 +[default0]:Skipping sample id=656368. Maximum sequence length: 2049, sample length: 2723 +[default0]:Skipping sample id=360639. Maximum sequence length: 2049, sample length: 2884 +[default0]:Skipping sample id=1044708. Maximum sequence length: 2049, sample length: 3229 +[default0]:Skipping sample id=530598. Maximum sequence length: 2049, sample length: 2078 +[default0]:Skipping sample id=1340960. Maximum sequence length: 2049, sample length: 3263 +[default0]:Skipping sample id=482503. Maximum sequence length: 2049, sample length: 2921 +[default0]:Skipping sample id=28411. Maximum sequence length: 2049, sample length: 4520 +[default0]:Skipping sample id=52152. Maximum sequence length: 2049, sample length: 4412 +[default0]:Skipping sample id=1143863. Maximum sequence length: 2049, sample length: 2263 +[default0]:Skipping sample id=1182187. Maximum sequence length: 2049, sample length: 4837 +[default0]:Skipping sample id=1498074. Maximum sequence length: 2049, sample length: 2706 +[default0]:Skipping sample id=507065. Maximum sequence length: 2049, sample length: 2066 +[default0]:Skipping sample id=176296. Maximum sequence length: 2049, sample length: 2656 +[default0]:Skipping sample id=138941. Maximum sequence length: 2049, sample length: 2847 +[default0]:Skipping sample id=204549. Maximum sequence length: 2049, sample length: 3043 +[default0]:Skipping sample id=1325473. Maximum sequence length: 2049, sample length: 5312 +[default0]:Skipping sample id=628976. Maximum sequence length: 2049, sample length: 4611 +[default0]:Skipping sample id=454803. Maximum sequence length: 2049, sample length: 3483 +[default0]:Skipping sample id=1404150. Maximum sequence length: 2049, sample length: 2097 +[default0]:Skipping sample id=592666. Maximum sequence length: 2049, sample length: 2831 +[default0]:Skipping sample id=1565086. Maximum sequence length: 2049, sample length: 2753 +[default0]:Skipping sample id=609079. Maximum sequence length: 2049, sample length: 2105 +[default0]:Skipping sample id=604839. Maximum sequence length: 2049, sample length: 2575 +[default0]:Skipping sample id=431482. Maximum sequence length: 2049, sample length: 2662 +[default0]:Skipping sample id=1075895. Maximum sequence length: 2049, sample length: 2516 +[default0]:Skipping sample id=1548039. Maximum sequence length: 2049, sample length: 2486 +[default0]:Skipping sample id=1303441. Maximum sequence length: 2049, sample length: 2250 +[default0]:Skipping sample id=246969. Maximum sequence length: 2049, sample length: 2430 +[default0]:Skipping sample id=871675. Maximum sequence length: 2049, sample length: 2250 +[default0]:Skipping sample id=229340. Maximum sequence length: 2049, sample length: 3073 +[default0]:Skipping sample id=669126. Maximum sequence length: 2049, sample length: 3788 +[default0]:Skipping sample id=871481. Maximum sequence length: 2049, sample length: 2343 +[default0]:Skipping sample id=859873. Maximum sequence length: 2049, sample length: 2533 +[default0]:Skipping sample id=832917. Maximum sequence length: 2049, sample length: 2471 +[default0]:Skipping sample id=1376287. Maximum sequence length: 2049, sample length: 2117 +[default0]:Skipping sample id=1350138. Maximum sequence length: 2049, sample length: 2544 +[default0]:Skipping sample id=1144712. Maximum sequence length: 2049, sample length: 2520 +[default0]:Skipping sample id=543665. Maximum sequence length: 2049, sample length: 3001 +[default0]:Skipping sample id=1173551. Maximum sequence length: 2049, sample length: 4248 +[default0]:Skipping sample id=1509220. Maximum sequence length: 2049, sample length: 2272 +[default0]:Skipping sample id=1238921. Maximum sequence length: 2049, sample length: 2348 +[default0]:Skipping sample id=606095. Maximum sequence length: 2049, sample length: 2908 +[default0]:Skipping sample id=1147013. Maximum sequence length: 2049, sample length: 2511 +[default0]:Skipping sample id=1312067. Maximum sequence length: 2049, sample length: 2790 +[default0]:Skipping sample id=787602. Maximum sequence length: 2049, sample length: 2294 +[default0]:Skipping sample id=463331. Maximum sequence length: 2049, sample length: 3141 +[default0]:Skipping sample id=1417026. Maximum sequence length: 2049, sample length: 2125 +[default0]:Skipping sample id=1105594. Maximum sequence length: 2049, sample length: 3347 +[default0]:Skipping sample id=813047. Maximum sequence length: 2049, sample length: 2876 +[default0]:Skipping sample id=298457. Maximum sequence length: 2049, sample length: 3596 +[default0]:Skipping sample id=128966. Maximum sequence length: 2049, sample length: 3219 +[default0]:Skipping sample id=788034. Maximum sequence length: 2049, sample length: 3269 +[default0]:Skipping sample id=401675. Maximum sequence length: 2049, sample length: 2543 +[default0]:Skipping sample id=793070. Maximum sequence length: 2049, sample length: 2554 +[default0]:Skipping sample id=362159. Maximum sequence length: 2049, sample length: 3329 +[default0]:Skipping sample id=1197957. Maximum sequence length: 2049, sample length: 3113 +[default0]:Skipping sample id=1204365. Maximum sequence length: 2049, sample length: 3735 +[default0]:Skipping sample id=215880. Maximum sequence length: 2049, sample length: 3470 +[default0]:Skipping sample id=919440. Maximum sequence length: 2049, sample length: 2828 +[default0]:Skipping sample id=408133. Maximum sequence length: 2049, sample length: 2504 +[default0]:Skipping sample id=1077494. Maximum sequence length: 2049, sample length: 5992 +[default0]:Skipping sample id=1424519. Maximum sequence length: 2049, sample length: 3389 +[default0]:Skipping sample id=716495. Maximum sequence length: 2049, sample length: 2087 +[default0]:Skipping sample id=533720. Maximum sequence length: 2049, sample length: 2704 +[default0]:Skipping sample id=837475. Maximum sequence length: 2049, sample length: 2253 +[default0]:Skipping sample id=966284. Maximum sequence length: 2049, sample length: 3204 +[default0]:Skipping sample id=198906. Maximum sequence length: 2049, sample length: 2190 +[default0]:Skipping sample id=1515192. Maximum sequence length: 2049, sample length: 2065 +[default0]:Skipping sample id=484003. Maximum sequence length: 2049, sample length: 2877 +[default0]:Skipping sample id=1051491. Maximum sequence length: 2049, sample length: 3984 +[default0]:Skipping sample id=1104176. Maximum sequence length: 2049, sample length: 2771 +[default0]:Skipping sample id=1362785. Maximum sequence length: 2049, sample length: 5310 +[default0]:Skipping sample id=1242913. Maximum sequence length: 2049, sample length: 2787 +[default0]:Skipping sample id=73625. Maximum sequence length: 2049, sample length: 4667 +[default0]:Skipping sample id=960994. Maximum sequence length: 2049, sample length: 2100 +[default0]:Skipping sample id=69279. Maximum sequence length: 2049, sample length: 2887 +[default0]:Skipping sample id=1240133. Maximum sequence length: 2049, sample length: 4725 +[default0]:Skipping sample id=1362530. Maximum sequence length: 2049, sample length: 2996 +[default0]:Skipping sample id=420536. Maximum sequence length: 2049, sample length: 4258 +[default0]:Skipping sample id=1099240. Maximum sequence length: 2049, sample length: 2946 +[default0]:Skipping sample id=1214717. Maximum sequence length: 2049, sample length: 2338 +[default0]:Skipping sample id=17296. Maximum sequence length: 2049, sample length: 4384 +[default0]:Skipping sample id=229833. Maximum sequence length: 2049, sample length: 2629 +[default0]:Skipping sample id=1568209. Maximum sequence length: 2049, sample length: 3688 +[default0]:Skipping sample id=1352476. Maximum sequence length: 2049, sample length: 3070 +[default0]:Skipping sample id=657030. Maximum sequence length: 2049, sample length: 2072 +[default0]:Skipping sample id=1053369. Maximum sequence length: 2049, sample length: 3627 +[default0]:Skipping sample id=1155400. Maximum sequence length: 2049, sample length: 4281 +[default0]:Skipping sample id=849942. Maximum sequence length: 2049, sample length: 2732 +[default0]:Skipping sample id=524772. Maximum sequence length: 2049, sample length: 2451 +[default0]:Skipping sample id=766067. Maximum sequence length: 2049, sample length: 2430 +[default0]:Skipping sample id=484620. Maximum sequence length: 2049, sample length: 2666 +[default0]:Skipping sample id=500240. Maximum sequence length: 2049, sample length: 2354 +[default0]:Skipping sample id=149792. Maximum sequence length: 2049, sample length: 2056 +[default0]:Skipping sample id=312156. Maximum sequence length: 2049, sample length: 2382 +[default0]:Skipping sample id=383256. Maximum sequence length: 2049, sample length: 2536 +[default0]:Skipping sample id=983180. Maximum sequence length: 2049, sample length: 2509 +[default0]:Skipping sample id=1338937. Maximum sequence length: 2049, sample length: 3138 +[default0]:Skipping sample id=42816. Maximum sequence length: 2049, sample length: 3720 +[default0]:Skipping sample id=199584. Maximum sequence length: 2049, sample length: 3920 +[default0]:Skipping sample id=912392. Maximum sequence length: 2049, sample length: 3755 +[default0]:Skipping sample id=719522. Maximum sequence length: 2049, sample length: 3078 +[default0]:Skipping sample id=1179089. Maximum sequence length: 2049, sample length: 2429 +[default0]:Skipping sample id=1108800. Maximum sequence length: 2049, sample length: 2248 +[default0]:Skipping sample id=1477900. Maximum sequence length: 2049, sample length: 3341 +[default0]:Skipping sample id=787955. Maximum sequence length: 2049, sample length: 3506 +[default0]:Skipping sample id=106773. Maximum sequence length: 2049, sample length: 2230 +[default0]:Skipping sample id=1464007. Maximum sequence length: 2049, sample length: 2280 +[default0]:Skipping sample id=76882. Maximum sequence length: 2049, sample length: 3583 +[default0]:Skipping sample id=478237. Maximum sequence length: 2049, sample length: 4885 +[default0]:Skipping sample id=1337710. Maximum sequence length: 2049, sample length: 2104 +[default0]:Skipping sample id=1420350. Maximum sequence length: 2049, sample length: 2109 +[default0]:Skipping sample id=1360879. Maximum sequence length: 2049, sample length: 4307 +[default0]:Skipping sample id=1326543. Maximum sequence length: 2049, sample length: 2727 +[default0]:Skipping sample id=1425787. Maximum sequence length: 2049, sample length: 3151 +[default0]:Skipping sample id=410797. Maximum sequence length: 2049, sample length: 3252 +[default0]:Skipping sample id=628986. Maximum sequence length: 2049, sample length: 2590 +[default0]:Skipping sample id=322900. Maximum sequence length: 2049, sample length: 3280 +[default0]:Skipping sample id=1147520. Maximum sequence length: 2049, sample length: 2523 +[default0]:Skipping sample id=301134. Maximum sequence length: 2049, sample length: 2071 +[default0]:Skipping sample id=1190167. Maximum sequence length: 2049, sample length: 2121 +[default0]:Skipping sample id=432083. Maximum sequence length: 2049, sample length: 2109 +[default0]:Skipping sample id=130963. Maximum sequence length: 2049, sample length: 2104 +[default0]:Skipping sample id=1516997. Maximum sequence length: 2049, sample length: 3148 +[default0]:Skipping sample id=545048. Maximum sequence length: 2049, sample length: 2621 +[default0]:Skipping sample id=1518463. Maximum sequence length: 2049, sample length: 2663 +[default0]:Skipping sample id=1202684. Maximum sequence length: 2049, sample length: 3167 +[default0]:Skipping sample id=469892. Maximum sequence length: 2049, sample length: 3113 +[default0]:Skipping sample id=849380. Maximum sequence length: 2049, sample length: 2392 +[default0]:Skipping sample id=447925. Maximum sequence length: 2049, sample length: 2108 +[default0]:Skipping sample id=1377213. Maximum sequence length: 2049, sample length: 2819 +[default0]:Skipping sample id=1412391. Maximum sequence length: 2049, sample length: 2552 +[default0]:Skipping sample id=955969. Maximum sequence length: 2049, sample length: 2050 +[default0]:Skipping sample id=1034199. Maximum sequence length: 2049, sample length: 2737 +[default0]:Skipping sample id=1539479. Maximum sequence length: 2049, sample length: 3010 +[default0]:Skipping sample id=1342252. Maximum sequence length: 2049, sample length: 2824 +[default0]:Skipping sample id=1349940. Maximum sequence length: 2049, sample length: 3918 +[default0]:Skipping sample id=1183760. Maximum sequence length: 2049, sample length: 2218 +[default0]:Skipping sample id=1412730. Maximum sequence length: 2049, sample length: 3026 +[default0]:Skipping sample id=265257. Maximum sequence length: 2049, sample length: 2995 +[default0]:Skipping sample id=437278. Maximum sequence length: 2049, sample length: 4746 +[default0]:Skipping sample id=105595. Maximum sequence length: 2049, sample length: 2090 +[default0]:Skipping sample id=473322. Maximum sequence length: 2049, sample length: 3268 +[default0]:Skipping sample id=662091. Maximum sequence length: 2049, sample length: 3300 +[default0]:Skipping sample id=978215. Maximum sequence length: 2049, sample length: 2783 +[default0]:Skipping sample id=1315040. Maximum sequence length: 2049, sample length: 2956 +[default0]:Skipping sample id=111822. Maximum sequence length: 2049, sample length: 2577 +[default0]:Skipping sample id=485318. Maximum sequence length: 2049, sample length: 2875 +[default0]:Skipping sample id=1230375. Maximum sequence length: 2049, sample length: 4311 +[default0]:Skipping sample id=1107422. Maximum sequence length: 2049, sample length: 2804 +[default0]:Skipping sample id=286909. Maximum sequence length: 2049, sample length: 2329 +[default0]:Skipping sample id=1238826. Maximum sequence length: 2049, sample length: 3435 +[default0]:Skipping sample id=1129293. Maximum sequence length: 2049, sample length: 2693 +[default0]:Skipping sample id=1474545. Maximum sequence length: 2049, sample length: 5899 +[default0]:Skipping sample id=488676. Maximum sequence length: 2049, sample length: 2133 +[default0]:Skipping sample id=564541. Maximum sequence length: 2049, sample length: 2059 +[default0]:Skipping sample id=1239507. Maximum sequence length: 2049, sample length: 3150 +[default0]:Skipping sample id=978275. Maximum sequence length: 2049, sample length: 2505 +[default0]:Skipping sample id=124125. Maximum sequence length: 2049, sample length: 5100 +[default0]:Skipping sample id=151211. Maximum sequence length: 2049, sample length: 2714 +[default0]:Skipping sample id=937760. Maximum sequence length: 2049, sample length: 2187 +[default0]:Skipping sample id=389197. Maximum sequence length: 2049, sample length: 2513 +[default0]:Skipping sample id=1022767. Maximum sequence length: 2049, sample length: 3581 +[default0]:Skipping sample id=1029587. Maximum sequence length: 2049, sample length: 2685 +[default0]:Skipping sample id=1504902. Maximum sequence length: 2049, sample length: 2613 +[default0]:Skipping sample id=1146266. Maximum sequence length: 2049, sample length: 3573 +[default0]:Skipping sample id=485944. Maximum sequence length: 2049, sample length: 6566 +[default0]:Skipping sample id=1476498. Maximum sequence length: 2049, sample length: 2097 +[default0]:Skipping sample id=925371. Maximum sequence length: 2049, sample length: 3036 +[default0]:Skipping sample id=569252. Maximum sequence length: 2049, sample length: 3003 +[default0]:Skipping sample id=769292. Maximum sequence length: 2049, sample length: 5167 +[default0]:Skipping sample id=476874. Maximum sequence length: 2049, sample length: 3188 +[default0]:Skipping sample id=1413349. Maximum sequence length: 2049, sample length: 2803 +[default0]:Skipping sample id=79037. Maximum sequence length: 2049, sample length: 3418 +[default0]:Skipping sample id=300432. Maximum sequence length: 2049, sample length: 3495 +[default0]:Skipping sample id=729516. Maximum sequence length: 2049, sample length: 5143 +[default0]:Skipping sample id=965980. Maximum sequence length: 2049, sample length: 2865 +[default0]:Skipping sample id=157396. Maximum sequence length: 2049, sample length: 2429 +[default0]:Skipping sample id=778692. Maximum sequence length: 2049, sample length: 4119 +[default0]:Skipping sample id=1433895. Maximum sequence length: 2049, sample length: 5244 +[default0]:Skipping sample id=218429. Maximum sequence length: 2049, sample length: 3297 +[default0]:Skipping sample id=234203. Maximum sequence length: 2049, sample length: 2625 +[default0]:Skipping sample id=779767. Maximum sequence length: 2049, sample length: 3190 +[default0]:Skipping sample id=968660. Maximum sequence length: 2049, sample length: 2089 +[default0]:Skipping sample id=724906. Maximum sequence length: 2049, sample length: 2423 +[default0]:Skipping sample id=204245. Maximum sequence length: 2049, sample length: 2230 +[default0]:Skipping sample id=231788. Maximum sequence length: 2049, sample length: 2665 +[default0]:Skipping sample id=485660. Maximum sequence length: 2049, sample length: 3308 +[default0]:Skipping sample id=742512. Maximum sequence length: 2049, sample length: 2751 +[default0]:Skipping sample id=169074. Maximum sequence length: 2049, sample length: 2846 +[default0]:Skipping sample id=717920. Maximum sequence length: 2049, sample length: 2958 +[default0]:Skipping sample id=1366884. Maximum sequence length: 2049, sample length: 2072 +[default0]:Skipping sample id=48868. Maximum sequence length: 2049, sample length: 5844 +[default0]:Skipping sample id=581069. Maximum sequence length: 2049, sample length: 2325 +[default0]:Skipping sample id=1139018. Maximum sequence length: 2049, sample length: 2743 +[default0]:Skipping sample id=1417652. Maximum sequence length: 2049, sample length: 2686 +[default0]:Skipping sample id=238812. Maximum sequence length: 2049, sample length: 2269 +[default0]:Skipping sample id=1016153. Maximum sequence length: 2049, sample length: 4011 +[default0]:Skipping sample id=245617. Maximum sequence length: 2049, sample length: 2557 +[default0]:Skipping sample id=1266177. Maximum sequence length: 2049, sample length: 2625 +[default0]:Skipping sample id=758823. Maximum sequence length: 2049, sample length: 3756 +[default0]:Skipping sample id=864941. Maximum sequence length: 2049, sample length: 2303 +[default0]:Skipping sample id=700363. Maximum sequence length: 2049, sample length: 2722 +[default0]:Skipping sample id=229992. Maximum sequence length: 2049, sample length: 4086 +[default0]:Skipping sample id=178156. Maximum sequence length: 2049, sample length: 2481 +[default0]:Skipping sample id=399436. Maximum sequence length: 2049, sample length: 4085 +[default0]:Skipping sample id=849734. Maximum sequence length: 2049, sample length: 3211 +[default0]:Skipping sample id=162167. Maximum sequence length: 2049, sample length: 4520 +[default0]:Skipping sample id=323918. Maximum sequence length: 2049, sample length: 5090 +[default0]:Skipping sample id=1179383. Maximum sequence length: 2049, sample length: 3498 +[default0]:Skipping sample id=1387429. Maximum sequence length: 2049, sample length: 3417 +[default0]:Skipping sample id=505235. Maximum sequence length: 2049, sample length: 2187 +[default0]:Skipping sample id=1231805. Maximum sequence length: 2049, sample length: 3502 +[default0]:Skipping sample id=215352. Maximum sequence length: 2049, sample length: 2161 +[default0]:Skipping sample id=617657. Maximum sequence length: 2049, sample length: 2398 +[default0]:Skipping sample id=318126. Maximum sequence length: 2049, sample length: 2263 +[default0]:Skipping sample id=479338. Maximum sequence length: 2049, sample length: 2382 +[default0]:Skipping sample id=615146. Maximum sequence length: 2049, sample length: 2289 +[default0]:Skipping sample id=1098248. Maximum sequence length: 2049, sample length: 2879 +[default0]:Skipping sample id=1249996. Maximum sequence length: 2049, sample length: 3332 +[default0]:Skipping sample id=1518558. Maximum sequence length: 2049, sample length: 2123 +[default0]:Skipping sample id=40885. Maximum sequence length: 2049, sample length: 4417 +[default0]:Skipping sample id=1262504. Maximum sequence length: 2049, sample length: 3338 +[default0]:Skipping sample id=44020. Maximum sequence length: 2049, sample length: 3706 +[default0]:Skipping sample id=384511. Maximum sequence length: 2049, sample length: 2392 +[default0]:Skipping sample id=1480724. Maximum sequence length: 2049, sample length: 2438 +[default0]:Skipping sample id=334432. Maximum sequence length: 2049, sample length: 3560 +[default0]:Skipping sample id=948773. Maximum sequence length: 2049, sample length: 2493 +[default0]:Skipping sample id=446702. Maximum sequence length: 2049, sample length: 2835 +[default0]:Skipping sample id=757435. Maximum sequence length: 2049, sample length: 2336 +[default0]:Skipping sample id=1171025. Maximum sequence length: 2049, sample length: 2887 +[default0]:Skipping sample id=1112113. Maximum sequence length: 2049, sample length: 2073 +[default0]:Skipping sample id=1562735. Maximum sequence length: 2049, sample length: 2958 +[default0]:Skipping sample id=1535287. Maximum sequence length: 2049, sample length: 3392 +[default0]:Skipping sample id=1224928. Maximum sequence length: 2049, sample length: 2476 +[default0]:Skipping sample id=274699. Maximum sequence length: 2049, sample length: 2322 +[default0]:Skipping sample id=72565. Maximum sequence length: 2049, sample length: 3566 +[default0]:Skipping sample id=970085. Maximum sequence length: 2049, sample length: 2346 +[default0]:Skipping sample id=790740. Maximum sequence length: 2049, sample length: 2373 +[default0]:Skipping sample id=1270466. Maximum sequence length: 2049, sample length: 2492 +[default0]:Skipping sample id=1359029. Maximum sequence length: 2049, sample length: 3975 +[default0]:Skipping sample id=300728. Maximum sequence length: 2049, sample length: 2391 +[default0]:Skipping sample id=689592. Maximum sequence length: 2049, sample length: 2147 +[default0]:Skipping sample id=1255500. Maximum sequence length: 2049, sample length: 3033 +[default0]:Skipping sample id=803720. Maximum sequence length: 2049, sample length: 2092 +[default0]:Skipping sample id=726175. Maximum sequence length: 2049, sample length: 2573 +[default0]:Skipping sample id=984965. Maximum sequence length: 2049, sample length: 3490 +[default0]:Skipping sample id=602708. Maximum sequence length: 2049, sample length: 2242 +[default0]:Skipping sample id=528024. Maximum sequence length: 2049, sample length: 2824 +[default0]:Skipping sample id=910477. Maximum sequence length: 2049, sample length: 2627 +[default0]:Skipping sample id=198343. Maximum sequence length: 2049, sample length: 2933 +[default0]:Skipping sample id=636860. Maximum sequence length: 2049, sample length: 2763 +[default0]:Skipping sample id=1430789. Maximum sequence length: 2049, sample length: 2117 +[default0]:Skipping sample id=1235520. Maximum sequence length: 2049, sample length: 2943 +[default0]:Skipping sample id=283899. Maximum sequence length: 2049, sample length: 4604 +[default0]:Skipping sample id=47132. Maximum sequence length: 2049, sample length: 4087 +[default0]:Skipping sample id=393629. Maximum sequence length: 2049, sample length: 6281 +[default0]:Skipping sample id=1065950. Maximum sequence length: 2049, sample length: 2437 +[default0]:Skipping sample id=436271. Maximum sequence length: 2049, sample length: 2083 +[default0]:Skipping sample id=1469763. Maximum sequence length: 2049, sample length: 4042 +[default0]:Skipping sample id=291771. Maximum sequence length: 2049, sample length: 2922 +[default0]:Skipping sample id=711720. Maximum sequence length: 2049, sample length: 2742 +[default0]:Skipping sample id=426574. Maximum sequence length: 2049, sample length: 3053 +[default0]:Skipping sample id=296113. Maximum sequence length: 2049, sample length: 2050 +[default0]:Skipping sample id=1156934. Maximum sequence length: 2049, sample length: 2230 +[default0]:Skipping sample id=281037. Maximum sequence length: 2049, sample length: 2203 +[default0]:Skipping sample id=812697. Maximum sequence length: 2049, sample length: 2195 +[default0]:Skipping sample id=62227. Maximum sequence length: 2049, sample length: 2422 +[default0]:Skipping sample id=1313161. Maximum sequence length: 2049, sample length: 2854 +[default0]:Skipping sample id=1212265. Maximum sequence length: 2049, sample length: 5516 +[default0]:Skipping sample id=535858. Maximum sequence length: 2049, sample length: 2628 +[default0]:Skipping sample id=277167. Maximum sequence length: 2049, sample length: 4144 +[default0]:Skipping sample id=13400. Maximum sequence length: 2049, sample length: 3001 +[default0]:Skipping sample id=262808. Maximum sequence length: 2049, sample length: 2354 +[default0]:Skipping sample id=456052. Maximum sequence length: 2049, sample length: 4971 +[default0]:Skipping sample id=329754. Maximum sequence length: 2049, sample length: 2170 +[default0]:Skipping sample id=430536. Maximum sequence length: 2049, sample length: 3284 +[default0]:Skipping sample id=664771. Maximum sequence length: 2049, sample length: 2108 +[default0]:Skipping sample id=1127578. Maximum sequence length: 2049, sample length: 2097 +[default0]:Skipping sample id=1304894. Maximum sequence length: 2049, sample length: 3897 +[default0]:Skipping sample id=1083531. Maximum sequence length: 2049, sample length: 2135 +[default0]:Skipping sample id=430812. Maximum sequence length: 2049, sample length: 2092 +[default0]:Skipping sample id=1526293. Maximum sequence length: 2049, sample length: 2361 +[default0]:Skipping sample id=296886. Maximum sequence length: 2049, sample length: 3089 +[default0]:Skipping sample id=1543205. Maximum sequence length: 2049, sample length: 4372 +[default0]:Skipping sample id=601392. Maximum sequence length: 2049, sample length: 3398 +[default0]:Skipping sample id=390580. Maximum sequence length: 2049, sample length: 3349 +[default0]:Skipping sample id=1168156. Maximum sequence length: 2049, sample length: 5241 +[default0]:Skipping sample id=612980. Maximum sequence length: 2049, sample length: 2181 +[default0]:Skipping sample id=1319948. Maximum sequence length: 2049, sample length: 3252 +[default0]:Skipping sample id=818226. Maximum sequence length: 2049, sample length: 2114 +[default0]:Skipping sample id=529379. Maximum sequence length: 2049, sample length: 4225 +[default0]:Skipping sample id=735770. Maximum sequence length: 2049, sample length: 3469 +[default0]:Skipping sample id=819758. Maximum sequence length: 2049, sample length: 2595 +[default0]:Skipping sample id=241200. Maximum sequence length: 2049, sample length: 2980 +[default0]:Skipping sample id=547460. Maximum sequence length: 2049, sample length: 3115 +[default0]:Skipping sample id=1515667. Maximum sequence length: 2049, sample length: 4129 +[default0]:Skipping sample id=475100. Maximum sequence length: 2049, sample length: 2248 +[default0]:Skipping sample id=693170. Maximum sequence length: 2049, sample length: 2770 +[default0]:Skipping sample id=891797. Maximum sequence length: 2049, sample length: 2260 +[default0]:Skipping sample id=855840. Maximum sequence length: 2049, sample length: 2828 +[default0]:Skipping sample id=1248325. Maximum sequence length: 2049, sample length: 2952 +[default0]:Skipping sample id=1132978. Maximum sequence length: 2049, sample length: 3969 +[default0]:Skipping sample id=1101696. Maximum sequence length: 2049, sample length: 2219 +[default0]:Skipping sample id=598984. Maximum sequence length: 2049, sample length: 3781 +[default0]:Skipping sample id=288193. Maximum sequence length: 2049, sample length: 4550 +[default0]:Skipping sample id=1480850. Maximum sequence length: 2049, sample length: 2641 +[default0]:Skipping sample id=1098310. Maximum sequence length: 2049, sample length: 4280 +[default0]:Skipping sample id=1220616. Maximum sequence length: 2049, sample length: 2308 +[default0]:Skipping sample id=1450260. Maximum sequence length: 2049, sample length: 4164 +[default0]:Skipping sample id=913119. Maximum sequence length: 2049, sample length: 2287 +[default0]:Skipping sample id=1170851. Maximum sequence length: 2049, sample length: 3376 +[default0]:Skipping sample id=904739. Maximum sequence length: 2049, sample length: 2246 +[default0]:Skipping sample id=375011. Maximum sequence length: 2049, sample length: 4999 +[default0]:Skipping sample id=317232. Maximum sequence length: 2049, sample length: 2716 +[default0]:Skipping sample id=1492821. Maximum sequence length: 2049, sample length: 3474 +[default0]:Skipping sample id=889750. Maximum sequence length: 2049, sample length: 2057 +[default0]:Skipping sample id=25835. Maximum sequence length: 2049, sample length: 2503 +[default0]:Skipping sample id=1551961. Maximum sequence length: 2049, sample length: 3064 +[default0]:Skipping sample id=356263. Maximum sequence length: 2049, sample length: 3475 +[default0]:Skipping sample id=579851. Maximum sequence length: 2049, sample length: 3373 +[default0]:Skipping sample id=1212053. Maximum sequence length: 2049, sample length: 5413 +[default0]:Skipping sample id=161572. Maximum sequence length: 2049, sample length: 2104 +[default0]:Skipping sample id=1275497. Maximum sequence length: 2049, sample length: 3750 +[default0]:Skipping sample id=1346468. Maximum sequence length: 2049, sample length: 2106 +[default0]:Skipping sample id=610208. Maximum sequence length: 2049, sample length: 2355 +[default0]:Skipping sample id=708255. Maximum sequence length: 2049, sample length: 3182 +[default0]:Skipping sample id=453546. Maximum sequence length: 2049, sample length: 2397 +[default0]:Skipping sample id=648726. Maximum sequence length: 2049, sample length: 2080 +[default0]:Skipping sample id=1432977. Maximum sequence length: 2049, sample length: 2906 +[default0]:Skipping sample id=156525. Maximum sequence length: 2049, sample length: 2765 +[default0]:Skipping sample id=1506581. Maximum sequence length: 2049, sample length: 2682 +[default0]:Skipping sample id=49021. Maximum sequence length: 2049, sample length: 2128 +[default0]:Skipping sample id=717947. Maximum sequence length: 2049, sample length: 2244 +[default0]:Skipping sample id=236524. Maximum sequence length: 2049, sample length: 2411 +[default0]:Skipping sample id=643530. Maximum sequence length: 2049, sample length: 3717 +[default0]:Skipping sample id=175287. Maximum sequence length: 2049, sample length: 2284 +[default0]:Skipping sample id=1126065. Maximum sequence length: 2049, sample length: 2824 +[default0]:Skipping sample id=159636. Maximum sequence length: 2049, sample length: 2125 +[default0]:Skipping sample id=652463. Maximum sequence length: 2049, sample length: 3354 +[default0]:Skipping sample id=416065. Maximum sequence length: 2049, sample length: 2299 +[default0]:Skipping sample id=555734. Maximum sequence length: 2049, sample length: 2092 +[default0]:Skipping sample id=1484580. Maximum sequence length: 2049, sample length: 3350 +[default0]:Skipping sample id=1192952. Maximum sequence length: 2049, sample length: 2402 +[default0]:Skipping sample id=964564. Maximum sequence length: 2049, sample length: 2345 +[default0]:Skipping sample id=1242622. Maximum sequence length: 2049, sample length: 3407 +[default0]:Skipping sample id=816976. Maximum sequence length: 2049, sample length: 2178 +[default0]:Skipping sample id=1529558. Maximum sequence length: 2049, sample length: 2875 +[default0]:Skipping sample id=138159. Maximum sequence length: 2049, sample length: 2301 +[default0]:Skipping sample id=505642. Maximum sequence length: 2049, sample length: 3643 +[default0]:Skipping sample id=1276453. Maximum sequence length: 2049, sample length: 4009 +[default0]:Skipping sample id=1302280. Maximum sequence length: 2049, sample length: 2145 +[default0]:Skipping sample id=1162164. Maximum sequence length: 2049, sample length: 3034 +[default0]:Skipping sample id=1350869. Maximum sequence length: 2049, sample length: 2456 +[default0]:Skipping sample id=1539010. Maximum sequence length: 2049, sample length: 3373 +[default0]:Skipping sample id=258422. Maximum sequence length: 2049, sample length: 2209 +[default0]:Skipping sample id=1184941. Maximum sequence length: 2049, sample length: 2272 +[default0]:Skipping sample id=1270515. Maximum sequence length: 2049, sample length: 3954 +[default0]:Skipping sample id=37042. Maximum sequence length: 2049, sample length: 2254 +[default0]:Skipping sample id=134601. Maximum sequence length: 2049, sample length: 2777 +[default0]:Skipping sample id=1029350. Maximum sequence length: 2049, sample length: 2146 +[default0]:Skipping sample id=919363. Maximum sequence length: 2049, sample length: 2090 +[default0]:Skipping sample id=650395. Maximum sequence length: 2049, sample length: 3121 +[default0]:Skipping sample id=1492223. Maximum sequence length: 2049, sample length: 2888 +[default0]:Skipping sample id=1030843. Maximum sequence length: 2049, sample length: 2070 +[default0]:Skipping sample id=845306. Maximum sequence length: 2049, sample length: 2235 +[default0]:Skipping sample id=1255759. Maximum sequence length: 2049, sample length: 2472 +[default0]:Skipping sample id=172314. Maximum sequence length: 2049, sample length: 4139 +[default0]:Skipping sample id=1072869. Maximum sequence length: 2049, sample length: 2057 +[default0]:Skipping sample id=780686. Maximum sequence length: 2049, sample length: 2812 +[default0]:Skipping sample id=750027. Maximum sequence length: 2049, sample length: 2150 +[default0]:Skipping sample id=460329. Maximum sequence length: 2049, sample length: 2527 +[default0]:Skipping sample id=150306. Maximum sequence length: 2049, sample length: 3150 +[default0]:Skipping sample id=596776. Maximum sequence length: 2049, sample length: 2079 +[default0]:Skipping sample id=697053. Maximum sequence length: 2049, sample length: 2341 +[default0]:Skipping sample id=91416. Maximum sequence length: 2049, sample length: 2697 +[default0]:Skipping sample id=121400. Maximum sequence length: 2049, sample length: 4904 +[default0]:Skipping sample id=236171. Maximum sequence length: 2049, sample length: 2528 +[default0]:Skipping sample id=521310. Maximum sequence length: 2049, sample length: 3164 +[default0]:Skipping sample id=1082572. Maximum sequence length: 2049, sample length: 3426 +[default0]:Skipping sample id=385010. Maximum sequence length: 2049, sample length: 3680 +[default0]:Skipping sample id=577727. Maximum sequence length: 2049, sample length: 2359 +[default0]:Skipping sample id=718285. Maximum sequence length: 2049, sample length: 2313 +[default0]:Skipping sample id=319937. Maximum sequence length: 2049, sample length: 2136 +[default0]:Skipping sample id=28664. Maximum sequence length: 2049, sample length: 2775 +[default0]:Skipping sample id=1031210. Maximum sequence length: 2049, sample length: 3578 +[default0]:Skipping sample id=507335. Maximum sequence length: 2049, sample length: 3829 +[default0]:Skipping sample id=1443460. Maximum sequence length: 2049, sample length: 3496 +[default0]:Skipping sample id=682133. Maximum sequence length: 2049, sample length: 2452 +[default0]:Skipping sample id=321879. Maximum sequence length: 2049, sample length: 2061 +[default0]:Skipping sample id=1266963. Maximum sequence length: 2049, sample length: 2637 +[default0]:Skipping sample id=406694. Maximum sequence length: 2049, sample length: 3718 +[default0]:Skipping sample id=1433574. Maximum sequence length: 2049, sample length: 2383 +[default0]:Skipping sample id=598252. Maximum sequence length: 2049, sample length: 4249 +[default0]:Skipping sample id=798620. Maximum sequence length: 2049, sample length: 3485 +[default0]:Skipping sample id=897423. Maximum sequence length: 2049, sample length: 2547 +[default0]:Skipping sample id=22789. Maximum sequence length: 2049, sample length: 2584 +[default0]:Skipping sample id=469972. Maximum sequence length: 2049, sample length: 5194 +[default0]:Skipping sample id=1075072. Maximum sequence length: 2049, sample length: 2842 +[default0]:Skipping sample id=434891. Maximum sequence length: 2049, sample length: 3530 +[default0]:Skipping sample id=1145896. Maximum sequence length: 2049, sample length: 2388 +[default0]:Skipping sample id=915486. Maximum sequence length: 2049, sample length: 2827 +[default0]:Skipping sample id=244126. Maximum sequence length: 2049, sample length: 2476 +[default0]:Skipping sample id=1255810. Maximum sequence length: 2049, sample length: 2170 +[default0]:Skipping sample id=688291. Maximum sequence length: 2049, sample length: 2656 +[default0]:Skipping sample id=1501575. Maximum sequence length: 2049, sample length: 2502 +[default0]:Skipping sample id=1334300. Maximum sequence length: 2049, sample length: 2288 +[default0]:Skipping sample id=222149. Maximum sequence length: 2049, sample length: 4020 +[default0]:Skipping sample id=1438497. Maximum sequence length: 2049, sample length: 2281 +[default0]:Skipping sample id=537742. Maximum sequence length: 2049, sample length: 2163 +[default0]:Skipping sample id=508462. Maximum sequence length: 2049, sample length: 3876 +[default0]:Skipping sample id=1051936. Maximum sequence length: 2049, sample length: 2523 +[default0]:Skipping sample id=984114. Maximum sequence length: 2049, sample length: 2535 +[default0]:Skipping sample id=884740. Maximum sequence length: 2049, sample length: 2214 +[default0]:Skipping sample id=829135. Maximum sequence length: 2049, sample length: 2728 +[default0]:Skipping sample id=251791. Maximum sequence length: 2049, sample length: 2141 +[default0]:Skipping sample id=573794. Maximum sequence length: 2049, sample length: 2329 +[default0]:Skipping sample id=1408558. Maximum sequence length: 2049, sample length: 2492 +[default0]:Skipping sample id=280910. Maximum sequence length: 2049, sample length: 2091 +[default0]:Skipping sample id=265991. Maximum sequence length: 2049, sample length: 3545 +[default0]:Skipping sample id=679386. Maximum sequence length: 2049, sample length: 3142 +[default0]:Skipping sample id=727400. Maximum sequence length: 2049, sample length: 3536 +[default0]:Skipping sample id=615161. Maximum sequence length: 2049, sample length: 5006 +[default0]:Skipping sample id=1025006. Maximum sequence length: 2049, sample length: 2275 +[default0]:Skipping sample id=992226. Maximum sequence length: 2049, sample length: 2569 +[default0]:Skipping sample id=1070859. Maximum sequence length: 2049, sample length: 3616 +[default0]:Skipping sample id=724115. Maximum sequence length: 2049, sample length: 2842 +[default0]:Skipping sample id=329678. Maximum sequence length: 2049, sample length: 3952 +[default0]:Skipping sample id=1148851. Maximum sequence length: 2049, sample length: 2983 +[default0]:Skipping sample id=1102217. Maximum sequence length: 2049, sample length: 4923 +[default0]:Skipping sample id=452379. Maximum sequence length: 2049, sample length: 3557 +[default0]:Skipping sample id=406161. Maximum sequence length: 2049, sample length: 4094 +[default0]:Skipping sample id=249560. Maximum sequence length: 2049, sample length: 2104 +[default0]:Skipping sample id=1125821. Maximum sequence length: 2049, sample length: 4024 +[default0]:Skipping sample id=919644. Maximum sequence length: 2049, sample length: 2116 +[default0]:Skipping sample id=1495745. Maximum sequence length: 2049, sample length: 3227 +[default0]:Skipping sample id=106950. Maximum sequence length: 2049, sample length: 2472 +[default0]:Skipping sample id=404536. Maximum sequence length: 2049, sample length: 3286 +[default0]:Skipping sample id=749316. Maximum sequence length: 2049, sample length: 2321 +[default0]:Skipping sample id=42921. Maximum sequence length: 2049, sample length: 3021 +[default0]:Skipping sample id=627089. Maximum sequence length: 2049, sample length: 2304 +[default0]:Skipping sample id=430000. Maximum sequence length: 2049, sample length: 2706 +[default0]:Skipping sample id=370545. Maximum sequence length: 2049, sample length: 3277 +[default0]:Skipping sample id=1094274. Maximum sequence length: 2049, sample length: 2655 +[default0]:Skipping sample id=295494. Maximum sequence length: 2049, sample length: 2652 +[default0]:Skipping sample id=1573662. Maximum sequence length: 2049, sample length: 2210 +[default0]:Skipping sample id=13427. Maximum sequence length: 2049, sample length: 3573 +[default0]:Skipping sample id=430092. Maximum sequence length: 2049, sample length: 2611 +[default0]:Skipping sample id=730834. Maximum sequence length: 2049, sample length: 3360 +[default0]:Skipping sample id=772237. Maximum sequence length: 2049, sample length: 2305 +[default0]:Skipping sample id=695015. Maximum sequence length: 2049, sample length: 3468 +[default0]:Skipping sample id=747529. Maximum sequence length: 2049, sample length: 2521 +[default0]:Skipping sample id=1064032. Maximum sequence length: 2049, sample length: 3990 +[default0]:Skipping sample id=375722. Maximum sequence length: 2049, sample length: 2057 +[default0]:Skipping sample id=766202. Maximum sequence length: 2049, sample length: 2691 +[default0]:Skipping sample id=913128. Maximum sequence length: 2049, sample length: 3145 +[default0]:Skipping sample id=1449940. Maximum sequence length: 2049, sample length: 2107 +[default0]:Skipping sample id=593745. Maximum sequence length: 2049, sample length: 2714 +[default0]:Skipping sample id=1081731. Maximum sequence length: 2049, sample length: 5052 +[default0]:Skipping sample id=1436635. Maximum sequence length: 2049, sample length: 4066 +[default0]:Skipping sample id=582372. Maximum sequence length: 2049, sample length: 3421 +[default0]:Skipping sample id=1353393. Maximum sequence length: 2049, sample length: 4308 +[default0]:Skipping sample id=586471. Maximum sequence length: 2049, sample length: 2829 +[default0]:Skipping sample id=1286295. Maximum sequence length: 2049, sample length: 2875 +[default0]:Skipping sample id=1159551. Maximum sequence length: 2049, sample length: 3911 +[default0]:Skipping sample id=268521. Maximum sequence length: 2049, sample length: 2203 +[default0]:Skipping sample id=841324. Maximum sequence length: 2049, sample length: 4192 +[default0]:Skipping sample id=47930. Maximum sequence length: 2049, sample length: 2872 +[default0]:Skipping sample id=404701. Maximum sequence length: 2049, sample length: 2167 +[default0]:Skipping sample id=410645. Maximum sequence length: 2049, sample length: 2240 +[default0]:Skipping sample id=434889. Maximum sequence length: 2049, sample length: 2076 +[default0]:Skipping sample id=55386. Maximum sequence length: 2049, sample length: 2346 +[default0]:Skipping sample id=662645. Maximum sequence length: 2049, sample length: 2846 +[default0]:Skipping sample id=625880. Maximum sequence length: 2049, sample length: 3038 +[default0]:Skipping sample id=609072. Maximum sequence length: 2049, sample length: 3381 +[default0]:Skipping sample id=715093. Maximum sequence length: 2049, sample length: 2986 +[default0]:Skipping sample id=1451561. Maximum sequence length: 2049, sample length: 2446 +[default0]:Skipping sample id=57775. Maximum sequence length: 2049, sample length: 2435 +[default0]:Skipping sample id=338914. Maximum sequence length: 2049, sample length: 3050 +[default0]:Skipping sample id=1086481. Maximum sequence length: 2049, sample length: 3789 +[default0]:Skipping sample id=612044. Maximum sequence length: 2049, sample length: 2791 +[default0]:Skipping sample id=1177085. Maximum sequence length: 2049, sample length: 2241 +[default0]:Skipping sample id=1281128. Maximum sequence length: 2049, sample length: 2281 +[default0]:Skipping sample id=485379. Maximum sequence length: 2049, sample length: 2317 +[default0]:Skipping sample id=525692. Maximum sequence length: 2049, sample length: 2545 +[default0]:Skipping sample id=1230232. Maximum sequence length: 2049, sample length: 3084 +[default0]:Skipping sample id=1085811. Maximum sequence length: 2049, sample length: 3431 +[default0]:Skipping sample id=461826. Maximum sequence length: 2049, sample length: 2524 +[default0]:Skipping sample id=965191. Maximum sequence length: 2049, sample length: 4044 +[default0]:Skipping sample id=1405230. Maximum sequence length: 2049, sample length: 2366 +[default0]:Skipping sample id=507230. Maximum sequence length: 2049, sample length: 2656 +[default0]:Skipping sample id=921907. Maximum sequence length: 2049, sample length: 4473 +[default0]:Skipping sample id=1172791. Maximum sequence length: 2049, sample length: 4922 +[default0]:Skipping sample id=653992. Maximum sequence length: 2049, sample length: 3339 +[default0]:Skipping sample id=1031321. Maximum sequence length: 2049, sample length: 2799 +[default0]:Skipping sample id=1182843. Maximum sequence length: 2049, sample length: 2777 +[default0]:Skipping sample id=75993. Maximum sequence length: 2049, sample length: 2559 +[default0]:Skipping sample id=22156. Maximum sequence length: 2049, sample length: 2496 +[default0]:Skipping sample id=411515. Maximum sequence length: 2049, sample length: 3187 +[default0]:Skipping sample id=910521. Maximum sequence length: 2049, sample length: 2431 +[default0]:Skipping sample id=697125. Maximum sequence length: 2049, sample length: 2219 +[default0]:Skipping sample id=349130. Maximum sequence length: 2049, sample length: 3620 +[default0]:Skipping sample id=416495. Maximum sequence length: 2049, sample length: 2687 +[default0]:Skipping sample id=456462. Maximum sequence length: 2049, sample length: 2969 +[default0]:Skipping sample id=981232. Maximum sequence length: 2049, sample length: 3694 +[default0]:Skipping sample id=16497. Maximum sequence length: 2049, sample length: 3170 +[default0]:Skipping sample id=355649. Maximum sequence length: 2049, sample length: 2380 +[default0]:Skipping sample id=504192. Maximum sequence length: 2049, sample length: 2127 +[default0]:Skipping sample id=1461050. Maximum sequence length: 2049, sample length: 2434 +[default0]:Skipping sample id=1049298. Maximum sequence length: 2049, sample length: 3001 +[default0]:Skipping sample id=1114482. Maximum sequence length: 2049, sample length: 2267 +[default0]:Skipping sample id=1228229. Maximum sequence length: 2049, sample length: 5356 +[default0]:Skipping sample id=1548676. Maximum sequence length: 2049, sample length: 2803 +[default0]:Skipping sample id=590912. Maximum sequence length: 2049, sample length: 3185 +[default0]:Skipping sample id=903901. Maximum sequence length: 2049, sample length: 2378 +[default0]:Skipping sample id=72062. Maximum sequence length: 2049, sample length: 2179 +[default0]:Skipping sample id=968489. Maximum sequence length: 2049, sample length: 2760 +[default0]:Skipping sample id=585945. Maximum sequence length: 2049, sample length: 2212 +[default0]:Skipping sample id=1524650. Maximum sequence length: 2049, sample length: 3324 +[default0]:Skipping sample id=190676. Maximum sequence length: 2049, sample length: 2437 +[default0]:Skipping sample id=140726. Maximum sequence length: 2049, sample length: 3213 +[default0]:Skipping sample id=145439. Maximum sequence length: 2049, sample length: 6448 +[default0]:Skipping sample id=1426077. Maximum sequence length: 2049, sample length: 3046 +[default0]:Skipping sample id=945449. Maximum sequence length: 2049, sample length: 4257 +[default0]:Skipping sample id=559105. Maximum sequence length: 2049, sample length: 3495 +[default0]:Skipping sample id=719855. Maximum sequence length: 2049, sample length: 4945 +[default0]:Skipping sample id=651243. Maximum sequence length: 2049, sample length: 2661 +[default0]:Skipping sample id=1084136. Maximum sequence length: 2049, sample length: 2090 +[default0]:Skipping sample id=314386. Maximum sequence length: 2049, sample length: 2123 +[default0]:Skipping sample id=1245404. Maximum sequence length: 2049, sample length: 3222 +[default0]:Skipping sample id=500695. Maximum sequence length: 2049, sample length: 2616 +[default0]:Skipping sample id=1475312. Maximum sequence length: 2049, sample length: 2947 +[default0]:Skipping sample id=1503745. Maximum sequence length: 2049, sample length: 2336 +[default0]:Skipping sample id=713891. Maximum sequence length: 2049, sample length: 3328 +[default0]:Skipping sample id=1365847. Maximum sequence length: 2049, sample length: 2065 +[default0]:Skipping sample id=36819. Maximum sequence length: 2049, sample length: 3989 +[default0]:Skipping sample id=757003. Maximum sequence length: 2049, sample length: 2298 +[default0]:Skipping sample id=85030. Maximum sequence length: 2049, sample length: 5631 +[default0]:Skipping sample id=463642. Maximum sequence length: 2049, sample length: 2592 +[default0]:Skipping sample id=1348059. Maximum sequence length: 2049, sample length: 3351 +[default0]:Skipping sample id=1143854. Maximum sequence length: 2049, sample length: 2572 +[default0]:Skipping sample id=1416689. Maximum sequence length: 2049, sample length: 2096 +[default0]:Skipping sample id=538336. Maximum sequence length: 2049, sample length: 2779 +[default0]:Skipping sample id=1362612. Maximum sequence length: 2049, sample length: 3449 +[default0]:Skipping sample id=138290. Maximum sequence length: 2049, sample length: 2330 +[default0]:Skipping sample id=818038. Maximum sequence length: 2049, sample length: 2066 +[default0]:Skipping sample id=501749. Maximum sequence length: 2049, sample length: 2286 +[default0]:Skipping sample id=925176. Maximum sequence length: 2049, sample length: 2108 +[default0]:Skipping sample id=1399314. Maximum sequence length: 2049, sample length: 2844 +[default0]:Skipping sample id=1220105. Maximum sequence length: 2049, sample length: 2212 +[default0]:Skipping sample id=326146. Maximum sequence length: 2049, sample length: 3686 +[default0]:Skipping sample id=1519267. Maximum sequence length: 2049, sample length: 2498 +[default0]:Skipping sample id=597431. Maximum sequence length: 2049, sample length: 3304 +[default0]:Skipping sample id=592316. Maximum sequence length: 2049, sample length: 3354 +[default0]:Skipping sample id=390397. Maximum sequence length: 2049, sample length: 2055 +[default0]:Skipping sample id=62739. Maximum sequence length: 2049, sample length: 3456 +[default0]:Skipping sample id=1261659. Maximum sequence length: 2049, sample length: 2249 +[default0]:Skipping sample id=489278. Maximum sequence length: 2049, sample length: 2498 +[default0]:Skipping sample id=1218098. Maximum sequence length: 2049, sample length: 2174 +[default0]:Skipping sample id=63425. Maximum sequence length: 2049, sample length: 2126 +[default0]:Skipping sample id=974472. Maximum sequence length: 2049, sample length: 4904 +[default0]:Skipping sample id=1200854. Maximum sequence length: 2049, sample length: 2140 +[default0]:Skipping sample id=461660. Maximum sequence length: 2049, sample length: 2107 +[default0]:Skipping sample id=1307775. Maximum sequence length: 2049, sample length: 4168 +[default0]:Skipping sample id=1192029. Maximum sequence length: 2049, sample length: 2268 +[default0]:Skipping sample id=708407. Maximum sequence length: 2049, sample length: 3030 +[default0]:Skipping sample id=500281. Maximum sequence length: 2049, sample length: 2545 +[default0]:Skipping sample id=611800. Maximum sequence length: 2049, sample length: 2949 +[default0]:Skipping sample id=1122030. Maximum sequence length: 2049, sample length: 3374 +[default0]:Skipping sample id=985422. Maximum sequence length: 2049, sample length: 4044 +[default0]:Skipping sample id=962442. Maximum sequence length: 2049, sample length: 2699 +[default0]:Skipping sample id=1093213. Maximum sequence length: 2049, sample length: 4554 +[default0]:Skipping sample id=116409. Maximum sequence length: 2049, sample length: 2279 +[default0]:Skipping sample id=1472570. Maximum sequence length: 2049, sample length: 2651 +[default0]:Skipping sample id=1306600. Maximum sequence length: 2049, sample length: 2247 +[default0]:Skipping sample id=1426378. Maximum sequence length: 2049, sample length: 2743 +[default0]:Skipping sample id=683008. Maximum sequence length: 2049, sample length: 2077 +[default0]:Skipping sample id=1518974. Maximum sequence length: 2049, sample length: 3854 +[default0]:Skipping sample id=884604. Maximum sequence length: 2049, sample length: 2936 +[default0]:Skipping sample id=1104116. Maximum sequence length: 2049, sample length: 3267 +[default0]:Skipping sample id=687074. Maximum sequence length: 2049, sample length: 2289 +[default0]:Skipping sample id=1439318. Maximum sequence length: 2049, sample length: 2394 +[default0]:Skipping sample id=1043639. Maximum sequence length: 2049, sample length: 2299 +[default0]:Skipping sample id=1332049. Maximum sequence length: 2049, sample length: 3286 +[default0]:Skipping sample id=1176339. Maximum sequence length: 2049, sample length: 2724 +[default0]:Skipping sample id=1281129. Maximum sequence length: 2049, sample length: 2336 +[default0]:Skipping sample id=1538900. Maximum sequence length: 2049, sample length: 3909 +[default0]:Skipping sample id=908805. Maximum sequence length: 2049, sample length: 2067 +[default0]:Skipping sample id=1215464. Maximum sequence length: 2049, sample length: 2806 +[default0]:Skipping sample id=486506. Maximum sequence length: 2049, sample length: 3325 +[default0]:Skipping sample id=1235838. Maximum sequence length: 2049, sample length: 2412 +[default0]:Skipping sample id=466012. Maximum sequence length: 2049, sample length: 5940 +[default0]:Skipping sample id=1341015. Maximum sequence length: 2049, sample length: 2351 +[default0]:Skipping sample id=1455055. Maximum sequence length: 2049, sample length: 2564 +[default0]:Skipping sample id=401202. Maximum sequence length: 2049, sample length: 2549 +[default0]:Skipping sample id=808600. Maximum sequence length: 2049, sample length: 3207 +[default0]:Skipping sample id=494581. Maximum sequence length: 2049, sample length: 2839 +[default0]:Skipping sample id=67644. Maximum sequence length: 2049, sample length: 2216 +[default0]:Skipping sample id=1126187. Maximum sequence length: 2049, sample length: 2994 +[default0]:Skipping sample id=138523. Maximum sequence length: 2049, sample length: 2233 +[default0]:Skipping sample id=777178. Maximum sequence length: 2049, sample length: 3705 +[default0]:Skipping sample id=1337216. Maximum sequence length: 2049, sample length: 2339 +[default0]:Skipping sample id=8133. Maximum sequence length: 2049, sample length: 2674 +[default0]:Skipping sample id=1465405. Maximum sequence length: 2049, sample length: 2590 +[default0]:Skipping sample id=1135164. Maximum sequence length: 2049, sample length: 2769 +[default0]:Skipping sample id=321688. Maximum sequence length: 2049, sample length: 4130 +[default0]:Skipping sample id=537089. Maximum sequence length: 2049, sample length: 2249 +[default0]:Skipping sample id=65939. Maximum sequence length: 2049, sample length: 2393 +[default0]:Skipping sample id=786499. Maximum sequence length: 2049, sample length: 2411 +[default0]:Skipping sample id=81454. Maximum sequence length: 2049, sample length: 2652 +[default0]:Skipping sample id=704083. Maximum sequence length: 2049, sample length: 4116 +[default0]:Skipping sample id=1255624. Maximum sequence length: 2049, sample length: 2632 +[default0]:Skipping sample id=1492270. Maximum sequence length: 2049, sample length: 4114 +[default0]:Skipping sample id=1313193. Maximum sequence length: 2049, sample length: 3885 +[default0]:Skipping sample id=701600. Maximum sequence length: 2049, sample length: 4385 +[default0]:Skipping sample id=122608. Maximum sequence length: 2049, sample length: 2232 +[default0]:Skipping sample id=918950. Maximum sequence length: 2049, sample length: 2445 +[default0]:Skipping sample id=1529370. Maximum sequence length: 2049, sample length: 2787 +[default0]:Skipping sample id=745387. Maximum sequence length: 2049, sample length: 2074 +[default0]:Skipping sample id=302215. Maximum sequence length: 2049, sample length: 4151 +[default0]:Skipping sample id=844904. Maximum sequence length: 2049, sample length: 2596 +[default0]:Skipping sample id=1410887. Maximum sequence length: 2049, sample length: 2062 +[default0]:Skipping sample id=762032. Maximum sequence length: 2049, sample length: 2214 +[default0]:Skipping sample id=484645. Maximum sequence length: 2049, sample length: 4587 +[default0]:Skipping sample id=957798. Maximum sequence length: 2049, sample length: 3085 +[default0]:Skipping sample id=1333725. Maximum sequence length: 2049, sample length: 5374 +[default0]:Skipping sample id=160007. Maximum sequence length: 2049, sample length: 2075 +[default0]:Skipping sample id=851234. Maximum sequence length: 2049, sample length: 3457 +[default0]:Skipping sample id=624352. Maximum sequence length: 2049, sample length: 2337 +[default0]:Skipping sample id=896184. Maximum sequence length: 2049, sample length: 2588 +[default0]:Skipping sample id=1127703. Maximum sequence length: 2049, sample length: 2529 +[default0]:Skipping sample id=877451. Maximum sequence length: 2049, sample length: 2212 +[default0]:Skipping sample id=597415. Maximum sequence length: 2049, sample length: 2138 +[default0]:Skipping sample id=1565457. Maximum sequence length: 2049, sample length: 2980 +[default0]:Skipping sample id=1223834. Maximum sequence length: 2049, sample length: 2504 +[default0]:Skipping sample id=458395. Maximum sequence length: 2049, sample length: 3940 +[default0]:Skipping sample id=820528. Maximum sequence length: 2049, sample length: 2084 +[default0]:Skipping sample id=372184. Maximum sequence length: 2049, sample length: 2108 +[default0]:Skipping sample id=263248. Maximum sequence length: 2049, sample length: 2568 +[default0]:Skipping sample id=607787. Maximum sequence length: 2049, sample length: 2247 +[default0]:Skipping sample id=89156. Maximum sequence length: 2049, sample length: 2275 +[default0]:Skipping sample id=1498896. Maximum sequence length: 2049, sample length: 2080 +[default0]:Skipping sample id=973530. Maximum sequence length: 2049, sample length: 2185 +[default0]:Skipping sample id=1532474. Maximum sequence length: 2049, sample length: 4003 +[default0]:Skipping sample id=1550995. Maximum sequence length: 2049, sample length: 2451 +[default0]:Skipping sample id=791129. Maximum sequence length: 2049, sample length: 2385 +[default0]:Skipping sample id=357520. Maximum sequence length: 2049, sample length: 4605 +[default0]:Skipping sample id=901428. Maximum sequence length: 2049, sample length: 3896 +[default0]:Skipping sample id=782064. Maximum sequence length: 2049, sample length: 2607 +[default0]:Skipping sample id=1489595. Maximum sequence length: 2049, sample length: 2133 +[default0]:Skipping sample id=294290. Maximum sequence length: 2049, sample length: 3238 +[default0]:Skipping sample id=585553. Maximum sequence length: 2049, sample length: 2631 +[default0]:Skipping sample id=763748. Maximum sequence length: 2049, sample length: 2584 +[default0]:Skipping sample id=900870. Maximum sequence length: 2049, sample length: 2937 +[default0]:Skipping sample id=680477. Maximum sequence length: 2049, sample length: 2297 +[default0]:Skipping sample id=1257279. Maximum sequence length: 2049, sample length: 2469 +[default0]:Skipping sample id=334107. Maximum sequence length: 2049, sample length: 2732 +[default0]:Skipping sample id=1023519. Maximum sequence length: 2049, sample length: 3869 +[default0]:Skipping sample id=276476. Maximum sequence length: 2049, sample length: 2965 +[default0]:Skipping sample id=979447. Maximum sequence length: 2049, sample length: 5222 +[default0]:Skipping sample id=1426038. Maximum sequence length: 2049, sample length: 2129 +[default0]:Skipping sample id=259058. Maximum sequence length: 2049, sample length: 2775 +[default0]:Skipping sample id=1153932. Maximum sequence length: 2049, sample length: 6817 +[default0]:Skipping sample id=974909. Maximum sequence length: 2049, sample length: 2767 +[default0]:Skipping sample id=474964. Maximum sequence length: 2049, sample length: 2387 +[default0]:Skipping sample id=619573. Maximum sequence length: 2049, sample length: 3344 +[default0]:Skipping sample id=1155449. Maximum sequence length: 2049, sample length: 2165 +[default0]:Skipping sample id=1250340. Maximum sequence length: 2049, sample length: 2702 +[default0]:Skipping sample id=1220977. Maximum sequence length: 2049, sample length: 2135 +[default0]:Skipping sample id=339310. Maximum sequence length: 2049, sample length: 2388 +[default0]:Skipping sample id=590871. Maximum sequence length: 2049, sample length: 4214 +[default0]:Skipping sample id=353107. Maximum sequence length: 2049, sample length: 4208 +[default0]:Skipping sample id=346214. Maximum sequence length: 2049, sample length: 4956 +[default0]:Skipping sample id=555833. Maximum sequence length: 2049, sample length: 3375 +[default0]:Skipping sample id=1110973. Maximum sequence length: 2049, sample length: 2450 +[default0]:Skipping sample id=679350. Maximum sequence length: 2049, sample length: 2870 +[default0]:Skipping sample id=892923. Maximum sequence length: 2049, sample length: 2179 +[default0]:Skipping sample id=137596. Maximum sequence length: 2049, sample length: 3022 +[default0]:Skipping sample id=1521607. Maximum sequence length: 2049, sample length: 2400 +[default0]:Skipping sample id=1265615. Maximum sequence length: 2049, sample length: 2462 +[default0]:Skipping sample id=686127. Maximum sequence length: 2049, sample length: 4140 +[default0]:Skipping sample id=1309069. Maximum sequence length: 2049, sample length: 2450 +[default0]:Skipping sample id=765033. Maximum sequence length: 2049, sample length: 6960 +[default0]:Skipping sample id=581716. Maximum sequence length: 2049, sample length: 4120 +[default0]:Skipping sample id=929186. Maximum sequence length: 2049, sample length: 2719 +[default0]:Skipping sample id=1482338. Maximum sequence length: 2049, sample length: 2383 +[default0]:Skipping sample id=666396. Maximum sequence length: 2049, sample length: 2062 +[default0]:Skipping sample id=635093. Maximum sequence length: 2049, sample length: 4358 +[default0]:Skipping sample id=356944. Maximum sequence length: 2049, sample length: 2236 +[default0]:Skipping sample id=850154. Maximum sequence length: 2049, sample length: 2730 +[default0]:Skipping sample id=1427376. Maximum sequence length: 2049, sample length: 3553 +[default0]:Skipping sample id=1391814. Maximum sequence length: 2049, sample length: 2172 +[default0]:Skipping sample id=702959. Maximum sequence length: 2049, sample length: 3094 +[default0]:Skipping sample id=1275473. Maximum sequence length: 2049, sample length: 2730 +[default0]:Skipping sample id=626763. Maximum sequence length: 2049, sample length: 2261 +[default0]:Skipping sample id=1415540. Maximum sequence length: 2049, sample length: 2734 +[default0]:Skipping sample id=608531. Maximum sequence length: 2049, sample length: 2498 +[default0]:Skipping sample id=323763. Maximum sequence length: 2049, sample length: 2219 +[default0]:Skipping sample id=705824. Maximum sequence length: 2049, sample length: 3056 +[default0]:Skipping sample id=1410053. Maximum sequence length: 2049, sample length: 2338 +[default0]:Skipping sample id=256051. Maximum sequence length: 2049, sample length: 3656 +[default0]:Skipping sample id=953566. Maximum sequence length: 2049, sample length: 3405 +[default0]:Skipping sample id=307672. Maximum sequence length: 2049, sample length: 2232 +[default0]:Skipping sample id=1113334. Maximum sequence length: 2049, sample length: 3230 +[default0]:Skipping sample id=1502726. Maximum sequence length: 2049, sample length: 4318 +[default0]:Skipping sample id=1361086. Maximum sequence length: 2049, sample length: 2288 +[default0]:Skipping sample id=1229999. Maximum sequence length: 2049, sample length: 2859 +[default0]:Skipping sample id=1343636. Maximum sequence length: 2049, sample length: 5235 +[default0]:Skipping sample id=363227. Maximum sequence length: 2049, sample length: 2172 +[default0]:Skipping sample id=1185604. Maximum sequence length: 2049, sample length: 2555 +[default0]:Skipping sample id=607990. Maximum sequence length: 2049, sample length: 2440 +[default0]:Skipping sample id=283407. Maximum sequence length: 2049, sample length: 2189 +[default0]:Skipping sample id=570543. Maximum sequence length: 2049, sample length: 2892 +[default0]:Skipping sample id=1477207. Maximum sequence length: 2049, sample length: 4910 +[default0]:Skipping sample id=1344922. Maximum sequence length: 2049, sample length: 2596 +[default0]:Skipping sample id=742879. Maximum sequence length: 2049, sample length: 3508 +[default0]:Skipping sample id=650868. Maximum sequence length: 2049, sample length: 2555 +[default0]:Skipping sample id=1395896. Maximum sequence length: 2049, sample length: 3351 +[default0]:Skipping sample id=1192857. Maximum sequence length: 2049, sample length: 2316 +[default0]:Skipping sample id=1317488. Maximum sequence length: 2049, sample length: 4138 +[default0]:Skipping sample id=628568. Maximum sequence length: 2049, sample length: 3114 +[default0]:Skipping sample id=23830. Maximum sequence length: 2049, sample length: 4786 +[default0]:Skipping sample id=1247948. Maximum sequence length: 2049, sample length: 2250 +[default0]:Skipping sample id=874289. Maximum sequence length: 2049, sample length: 2821 +[default0]:Skipping sample id=438702. Maximum sequence length: 2049, sample length: 5304 +[default0]:Skipping sample id=1243226. Maximum sequence length: 2049, sample length: 2760 +[default0]:Skipping sample id=1056129. Maximum sequence length: 2049, sample length: 2977 +[default0]:Skipping sample id=192233. Maximum sequence length: 2049, sample length: 2351 +[default0]:Skipping sample id=132895. Maximum sequence length: 2049, sample length: 2570 +[default0]:Skipping sample id=434409. Maximum sequence length: 2049, sample length: 3566 +[default0]:Skipping sample id=19295. Maximum sequence length: 2049, sample length: 3509 +[default0]:Skipping sample id=203841. Maximum sequence length: 2049, sample length: 2742 +[default0]:Skipping sample id=1325186. Maximum sequence length: 2049, sample length: 2612 +[default0]:Skipping sample id=98235. Maximum sequence length: 2049, sample length: 5006 +[default0]:Skipping sample id=7293. Maximum sequence length: 2049, sample length: 3231 +[default0]:Skipping sample id=679795. Maximum sequence length: 2049, sample length: 2406 +[default0]:Skipping sample id=368298. Maximum sequence length: 2049, sample length: 3715 +[default0]:Skipping sample id=638521. Maximum sequence length: 2049, sample length: 2070 +[default0]:Skipping sample id=1347708. Maximum sequence length: 2049, sample length: 3766 +[default0]:Skipping sample id=1144173. Maximum sequence length: 2049, sample length: 2337 +[default0]:Skipping sample id=375865. Maximum sequence length: 2049, sample length: 2772 +[default0]:Skipping sample id=23038. Maximum sequence length: 2049, sample length: 2249 +[default0]:Skipping sample id=1279618. Maximum sequence length: 2049, sample length: 3653 +[default0]:Skipping sample id=648180. Maximum sequence length: 2049, sample length: 2083 +[default0]:Skipping sample id=1349611. Maximum sequence length: 2049, sample length: 2076 +[default0]:Skipping sample id=632695. Maximum sequence length: 2049, sample length: 3332 +[default0]:Skipping sample id=1168703. Maximum sequence length: 2049, sample length: 2369 +[default0]:Skipping sample id=200772. Maximum sequence length: 2049, sample length: 2170 +[default0]:Skipping sample id=917174. Maximum sequence length: 2049, sample length: 3323 +[default0]:Skipping sample id=1399996. Maximum sequence length: 2049, sample length: 3168 +[default0]:Skipping sample id=222811. Maximum sequence length: 2049, sample length: 2521 +[default0]:Skipping sample id=346653. Maximum sequence length: 2049, sample length: 2913 +[default0]:Skipping sample id=523049. Maximum sequence length: 2049, sample length: 2820 +[default0]:Skipping sample id=872689. Maximum sequence length: 2049, sample length: 2365 +[default0]:Skipping sample id=879871. Maximum sequence length: 2049, sample length: 2556 +[default0]:Skipping sample id=1446453. Maximum sequence length: 2049, sample length: 2440 +[default0]:Skipping sample id=202587. Maximum sequence length: 2049, sample length: 2251 +[default0]:Skipping sample id=75909. Maximum sequence length: 2049, sample length: 4349 +[default0]:Skipping sample id=849277. Maximum sequence length: 2049, sample length: 2928 +[default0]:Skipping sample id=79874. Maximum sequence length: 2049, sample length: 2110 +[default0]:Skipping sample id=202032. Maximum sequence length: 2049, sample length: 2388 +[default0]:Skipping sample id=1204249. Maximum sequence length: 2049, sample length: 7265 +[default0]:Skipping sample id=352401. Maximum sequence length: 2049, sample length: 2107 +[default0]:Skipping sample id=315795. Maximum sequence length: 2049, sample length: 3313 +[default0]:Skipping sample id=418908. Maximum sequence length: 2049, sample length: 2859 +[default0]:Skipping sample id=807463. Maximum sequence length: 2049, sample length: 2097 +[default0]:Skipping sample id=90930. Maximum sequence length: 2049, sample length: 2369 +[default0]:Skipping sample id=1132516. Maximum sequence length: 2049, sample length: 6799 +[default0]:Skipping sample id=1092797. Maximum sequence length: 2049, sample length: 2431 +[default0]:Skipping sample id=269275. Maximum sequence length: 2049, sample length: 2237 +[default0]:Skipping sample id=1354539. Maximum sequence length: 2049, sample length: 2953 +[default0]:Skipping sample id=747424. Maximum sequence length: 2049, sample length: 2111 +[default0]:Skipping sample id=1138368. Maximum sequence length: 2049, sample length: 3371 +[default0]:Skipping sample id=1022225. Maximum sequence length: 2049, sample length: 2516 +[default0]:Skipping sample id=1127893. Maximum sequence length: 2049, sample length: 2550 +[default0]:Skipping sample id=1546675. Maximum sequence length: 2049, sample length: 2413 +[default0]:Skipping sample id=1225135. Maximum sequence length: 2049, sample length: 2065 +[default0]:Skipping sample id=1368572. Maximum sequence length: 2049, sample length: 2092 +[default0]:Skipping sample id=818180. Maximum sequence length: 2049, sample length: 2941 +[default0]:Skipping sample id=438824. Maximum sequence length: 2049, sample length: 3985 +[default0]:Skipping sample id=1210340. Maximum sequence length: 2049, sample length: 2202 +[default0]:Skipping sample id=213869. Maximum sequence length: 2049, sample length: 3757 +[default0]:Skipping sample id=1454823. Maximum sequence length: 2049, sample length: 2123 +[default0]:Skipping sample id=1440846. Maximum sequence length: 2049, sample length: 2631 +[default0]:Skipping sample id=340791. Maximum sequence length: 2049, sample length: 2600 +[default0]:Skipping sample id=1492314. Maximum sequence length: 2049, sample length: 2090 +[default0]:Skipping sample id=248585. Maximum sequence length: 2049, sample length: 2124 +[default0]:Skipping sample id=1523342. Maximum sequence length: 2049, sample length: 3976 +[default0]:Skipping sample id=143954. Maximum sequence length: 2049, sample length: 2250 +[default0]:Skipping sample id=168482. Maximum sequence length: 2049, sample length: 3031 +[default0]:Skipping sample id=337468. Maximum sequence length: 2049, sample length: 3381 +[default0]:Skipping sample id=716300. Maximum sequence length: 2049, sample length: 6707 +[default0]:Skipping sample id=28634. Maximum sequence length: 2049, sample length: 3971 +[default0]:Skipping sample id=624225. Maximum sequence length: 2049, sample length: 2124 +[default0]:Skipping sample id=1540978. Maximum sequence length: 2049, sample length: 4606 +[default0]:Skipping sample id=1563880. Maximum sequence length: 2049, sample length: 3601 +[default0]:Skipping sample id=130112. Maximum sequence length: 2049, sample length: 2167 +[default0]:Skipping sample id=1521275. Maximum sequence length: 2049, sample length: 3187 +[default0]:Skipping sample id=1183132. Maximum sequence length: 2049, sample length: 2072 +[default0]:Skipping sample id=1263814. Maximum sequence length: 2049, sample length: 2257 +[default0]:Skipping sample id=593184. Maximum sequence length: 2049, sample length: 2201 +[default0]:Skipping sample id=653980. Maximum sequence length: 2049, sample length: 2152 +[default0]:Skipping sample id=210704. Maximum sequence length: 2049, sample length: 4748 +[default0]:Skipping sample id=1113088. Maximum sequence length: 2049, sample length: 3544 +[default0]:Skipping sample id=1444878. Maximum sequence length: 2049, sample length: 2368 +[default0]:Skipping sample id=992711. Maximum sequence length: 2049, sample length: 2226 +[default0]:Skipping sample id=1132934. Maximum sequence length: 2049, sample length: 2175 +[default0]:Skipping sample id=134974. Maximum sequence length: 2049, sample length: 3051 +[default0]:Skipping sample id=1492625. Maximum sequence length: 2049, sample length: 2454 +[default0]:Skipping sample id=1124487. Maximum sequence length: 2049, sample length: 2153 +[default0]:Skipping sample id=443884. Maximum sequence length: 2049, sample length: 3487 +[default0]:Skipping sample id=1055033. Maximum sequence length: 2049, sample length: 2875 +[default0]:Skipping sample id=369432. Maximum sequence length: 2049, sample length: 2343 +[default0]:Skipping sample id=1355824. Maximum sequence length: 2049, sample length: 2630 +[default0]:Skipping sample id=680047. Maximum sequence length: 2049, sample length: 3393 +[default0]:Skipping sample id=1027903. Maximum sequence length: 2049, sample length: 2871 +[default0]:Skipping sample id=1492669. Maximum sequence length: 2049, sample length: 2247 +[default0]:Skipping sample id=149333. Maximum sequence length: 2049, sample length: 2224 +[default0]:Skipping sample id=872402. Maximum sequence length: 2049, sample length: 2101 +[default0]:Skipping sample id=904890. Maximum sequence length: 2049, sample length: 3194 +[default0]:Skipping sample id=3212. Maximum sequence length: 2049, sample length: 4138 +[default0]:Skipping sample id=1197091. Maximum sequence length: 2049, sample length: 2251 +[default0]:Skipping sample id=822805. Maximum sequence length: 2049, sample length: 2600 +[default0]:Skipping sample id=1385140. Maximum sequence length: 2049, sample length: 3274 +[default0]:Skipping sample id=1134191. Maximum sequence length: 2049, sample length: 2883 +[default0]:Skipping sample id=750909. Maximum sequence length: 2049, sample length: 2524 +[default0]:Skipping sample id=841172. Maximum sequence length: 2049, sample length: 2516 +[default0]:Skipping sample id=132957. Maximum sequence length: 2049, sample length: 2506 +[default0]:Skipping sample id=224402. Maximum sequence length: 2049, sample length: 2676 +[default0]:Skipping sample id=1277071. Maximum sequence length: 2049, sample length: 3279 +[default0]:Skipping sample id=1492704. Maximum sequence length: 2049, sample length: 2316 +[default0]:Skipping sample id=854967. Maximum sequence length: 2049, sample length: 2219 +[default0]:Skipping sample id=748870. Maximum sequence length: 2049, sample length: 4736 +[default0]:Skipping sample id=1443828. Maximum sequence length: 2049, sample length: 2563 +[default0]:Skipping sample id=1100708. Maximum sequence length: 2049, sample length: 3046 +[default0]:Skipping sample id=797700. Maximum sequence length: 2049, sample length: 2087 +[default0]:Skipping sample id=1470973. Maximum sequence length: 2049, sample length: 2282 +[default0]:Skipping sample id=1473042. Maximum sequence length: 2049, sample length: 2119 +[default0]:Skipping sample id=20968. Maximum sequence length: 2049, sample length: 2235 +[default0]:Skipping sample id=982099. Maximum sequence length: 2049, sample length: 2115 +[default0]:Skipping sample id=20946. Maximum sequence length: 2049, sample length: 2206 +[default0]:Skipping sample id=1270612. Maximum sequence length: 2049, sample length: 3167 +[default0]:Skipping sample id=441209. Maximum sequence length: 2049, sample length: 2290 +[default0]:Skipping sample id=1528914. Maximum sequence length: 2049, sample length: 3105 +[default0]:Skipping sample id=1104188. Maximum sequence length: 2049, sample length: 4293 +[default0]:Skipping sample id=1569615. Maximum sequence length: 2049, sample length: 2538 +[default0]:Skipping sample id=1300457. Maximum sequence length: 2049, sample length: 2401 +[default0]:Skipping sample id=1295125. Maximum sequence length: 2049, sample length: 2252 +[default0]:Skipping sample id=762922. Maximum sequence length: 2049, sample length: 6097 +[default0]:Skipping sample id=684659. Maximum sequence length: 2049, sample length: 2317 +[default0]:Skipping sample id=1384439. Maximum sequence length: 2049, sample length: 2218 +[default0]:Skipping sample id=1563482. Maximum sequence length: 2049, sample length: 3157 +[default0]:Skipping sample id=1150311. Maximum sequence length: 2049, sample length: 3191 +[default0]:Skipping sample id=164244. Maximum sequence length: 2049, sample length: 2261 +[default0]:Skipping sample id=505421. Maximum sequence length: 2049, sample length: 3000 +[default0]:Skipping sample id=645499. Maximum sequence length: 2049, sample length: 2147 +[default0]:Skipping sample id=810506. Maximum sequence length: 2049, sample length: 2221 +[default0]:Skipping sample id=1099044. Maximum sequence length: 2049, sample length: 5670 +[default0]:Skipping sample id=794042. Maximum sequence length: 2049, sample length: 2170 +[default0]:Skipping sample id=1440346. Maximum sequence length: 2049, sample length: 3303 +[default0]:Skipping sample id=1350816. Maximum sequence length: 2049, sample length: 2224 +[default0]:Skipping sample id=1330805. Maximum sequence length: 2049, sample length: 2341 +[default0]:Skipping sample id=342553. Maximum sequence length: 2049, sample length: 2935 +[default0]:Skipping sample id=1461177. Maximum sequence length: 2049, sample length: 2866 +[default0]:Skipping sample id=431322. Maximum sequence length: 2049, sample length: 2531 +[default0]:Skipping sample id=1246735. Maximum sequence length: 2049, sample length: 4246 +[default0]:Skipping sample id=1010165. Maximum sequence length: 2049, sample length: 2637 +[default0]:Skipping sample id=1459089. Maximum sequence length: 2049, sample length: 4276 +[default0]:Skipping sample id=440210. Maximum sequence length: 2049, sample length: 3376 +[default0]:Skipping sample id=1321024. Maximum sequence length: 2049, sample length: 2144 +[default0]:Skipping sample id=1393453. Maximum sequence length: 2049, sample length: 3064 +[default0]:Skipping sample id=72095. Maximum sequence length: 2049, sample length: 3097 +[default0]:Skipping sample id=1175189. Maximum sequence length: 2049, sample length: 2118 +[default0]:Skipping sample id=107449. Maximum sequence length: 2049, sample length: 2716 +[default0]:Skipping sample id=934550. Maximum sequence length: 2049, sample length: 2833 +[default0]:Skipping sample id=1361425. Maximum sequence length: 2049, sample length: 2213 +[default0]:Skipping sample id=132886. Maximum sequence length: 2049, sample length: 2420 +[default0]:Skipping sample id=576515. Maximum sequence length: 2049, sample length: 2375 +[default0]:Skipping sample id=283006. Maximum sequence length: 2049, sample length: 3081 +[default0]:Skipping sample id=1000460. Maximum sequence length: 2049, sample length: 5754 +[default0]:Skipping sample id=270170. Maximum sequence length: 2049, sample length: 4643 +[default0]:Skipping sample id=743088. Maximum sequence length: 2049, sample length: 2050 +[default0]:Skipping sample id=273586. Maximum sequence length: 2049, sample length: 4071 +[default0]:Skipping sample id=1334536. Maximum sequence length: 2049, sample length: 3128 +[default0]:Skipping sample id=288816. Maximum sequence length: 2049, sample length: 3280 +[default0]:Skipping sample id=635329. Maximum sequence length: 2049, sample length: 3638 +[default0]:Skipping sample id=1204798. Maximum sequence length: 2049, sample length: 2198 +[default0]:Skipping sample id=97852. Maximum sequence length: 2049, sample length: 3387 +[default0]:Skipping sample id=175006. Maximum sequence length: 2049, sample length: 3548 +[default0]:Skipping sample id=283780. Maximum sequence length: 2049, sample length: 2174 +[default0]:Skipping sample id=943115. Maximum sequence length: 2049, sample length: 3333 +[default0]:Skipping sample id=649438. Maximum sequence length: 2049, sample length: 2396 +[default0]:Skipping sample id=552076. Maximum sequence length: 2049, sample length: 2487 +[default0]:Skipping sample id=110017. Maximum sequence length: 2049, sample length: 2616 +[default0]:Skipping sample id=66150. Maximum sequence length: 2049, sample length: 2514 +[default0]:Skipping sample id=77454. Maximum sequence length: 2049, sample length: 2139 +[default0]:Skipping sample id=943026. Maximum sequence length: 2049, sample length: 2441 +[default0]:Skipping sample id=1083136. Maximum sequence length: 2049, sample length: 2206 +[default0]:Skipping sample id=1305794. Maximum sequence length: 2049, sample length: 2066 +[default0]:Skipping sample id=433027. Maximum sequence length: 2049, sample length: 2163 +[default0]:Skipping sample id=275987. Maximum sequence length: 2049, sample length: 3996 +[default0]:Skipping sample id=208124. Maximum sequence length: 2049, sample length: 3195 +[default0]: > elasped time to build and save shuffle-idx and sample-idx mapping (seconds): 4.506038 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_en_validation_indexmap_51797ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_en_validation_indexmap_51797ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.005 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.002809 seconds +[default0]: number of documents: 5151349 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [4893782, 5151349) total of 257567 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.004195 seconds +[default0]: number of documents: 5151349 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003422 seconds +[default0]: number of documents: 5151349 +[default0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[default0]: > elasped time to build and save shuffle-idx and sample-idx mapping (seconds): 0.675867 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_es_validation_indexmap_10526ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_es_validation_indexmap_10526ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.003 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003725 seconds +[default0]: number of documents: 3562772 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [3384633, 3562772) total of 178139 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.006915 seconds +[default0]: number of documents: 3562772 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003049 seconds +[default0]: number of documents: 3562772 +[default0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[default0]: > elasped time to build and save shuffle-idx and sample-idx mapping (seconds): 0.646958 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_pt_validation_indexmap_8521ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_pt_validation_indexmap_8521ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.002 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.006298 seconds +[default0]: number of documents: 2707724 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [2572338, 2707724) total of 135386 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.010460 seconds +[default0]: number of documents: 2707724 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003795 seconds +[default0]: number of documents: 2707724 +[default0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[default0]:Skipping sample id=120742. Maximum sequence length: 2049, sample length: 2101 +[default0]: > elasped time to build and save shuffle-idx and sample-idx mapping (seconds): 0.386600 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_code_validation_indexmap_7827ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_code_validation_indexmap_7827ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.002 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.010881 seconds +[default0]: number of documents: 5055942 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [4803145, 5055942) total of 252797 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.004565 seconds +[default0]: number of documents: 5055942 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003041 seconds +[default0]: number of documents: 5055942 +[default0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[default0]: > elasped time to build and save shuffle-idx and sample-idx mapping (seconds): 0.635300 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_fr_validation_indexmap_7711ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_fr_validation_indexmap_7711ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.002 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.006549 seconds +[default0]: number of documents: 2148955 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [2041507, 2148955) total of 107448 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.007375 seconds +[default0]: number of documents: 2148955 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.004765 seconds +[default0]: number of documents: 2148955 +[default0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[default0]: > elasped time to build and save shuffle-idx and sample-idx mapping (seconds): 0.454499 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ar_validation_indexmap_6502ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ar_validation_indexmap_6502ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.002 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.007397 seconds +[default0]: number of documents: 2627392 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [2496022, 2627392) total of 131370 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.008052 seconds +[default0]: number of documents: 2627392 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003816 seconds +[default0]: number of documents: 2627392 +[default0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[default0]: > elasped time to build and save shuffle-idx and sample-idx mapping (seconds): 0.353576 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_id_validation_indexmap_6404ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_id_validation_indexmap_6404ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.002 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.007236 seconds +[default0]: number of documents: 3560556 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [3382528, 3560556) total of 178028 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.008328 seconds +[default0]: number of documents: 3560556 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003633 seconds +[default0]: number of documents: 3560556 +[default0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[default0]:Skipping sample id=37793. Maximum sequence length: 2049, sample length: 2120 +[default0]:Skipping sample id=14452. Maximum sequence length: 2049, sample length: 2110 +[default0]:Skipping sample id=13611. Maximum sequence length: 2049, sample length: 2156 +[default0]:Skipping sample id=169104. Maximum sequence length: 2049, sample length: 2073 +[default0]: > elasped time to build and save shuffle-idx and sample-idx mapping (seconds): 0.461827 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_zh_validation_indexmap_6381ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_zh_validation_indexmap_6381ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.002 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.004413 seconds +[default0]: number of documents: 1543441 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [1466269, 1543441) total of 77172 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.007236 seconds +[default0]: number of documents: 1543441 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.004263 seconds +[default0]: number of documents: 1543441 +[default0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[default0]: > elasped time to build and save shuffle-idx and sample-idx mapping (seconds): 0.377170 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_hi_validation_indexmap_6108ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_hi_validation_indexmap_6108ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.001 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.007313 seconds +[default0]: number of documents: 1667306 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [1583941, 1667306) total of 83365 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.007605 seconds +[default0]: number of documents: 1667306 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003782 seconds +[default0]: number of documents: 1667306 +[default0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[default0]: > elasped time to build and save shuffle-idx and sample-idx mapping (seconds): 0.381993 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_vi_validation_indexmap_4312ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_vi_validation_indexmap_4312ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.001 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.007800 seconds +[default0]: number of documents: 855756 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [812968, 855756) total of 42788 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.009587 seconds +[default0]: number of documents: 855756 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003613 seconds +[default0]: number of documents: 855756 +[default0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[default0]: > elasped time to build and save shuffle-idx and sample-idx mapping (seconds): 0.277906 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ur_validation_indexmap_2667ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ur_validation_indexmap_2667ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.001 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.005136 seconds +[default0]: number of documents: 573364 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [544696, 573364) total of 28668 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.005695 seconds +[default0]: number of documents: 573364 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.002271 seconds +[default0]: number of documents: 573364 +[default0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[default0]: > elasped time to build and save shuffle-idx and sample-idx mapping (seconds): 0.215396 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_te_validation_indexmap_1853ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_te_validation_indexmap_1853ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.001 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.004277 seconds +[default0]: number of documents: 410633 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [390101, 410633) total of 20532 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003817 seconds +[default0]: number of documents: 410633 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.002105 seconds +[default0]: number of documents: 410633 +[default0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[default0]: > elasped time to build and save shuffle-idx and sample-idx mapping (seconds): 0.055418 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ta_validation_indexmap_1286ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ta_validation_indexmap_1286ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.001 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.005721 seconds +[default0]: number of documents: 428843 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [407401, 428843) total of 21442 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.002419 seconds +[default0]: number of documents: 428843 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001397 seconds +[default0]: number of documents: 428843 +[default0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[default0]: > elasped time to build and save shuffle-idx and sample-idx mapping (seconds): 0.057026 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_bn_validation_indexmap_1158ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_bn_validation_indexmap_1158ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.001 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.004469 seconds +[default0]: number of documents: 417269 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [396406, 417269) total of 20863 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.004011 seconds +[default0]: number of documents: 417269 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001515 seconds +[default0]: number of documents: 417269 +[default0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[default0]: > elasped time to build and save shuffle-idx and sample-idx mapping (seconds): 0.055069 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_mr_validation_indexmap_927ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_mr_validation_indexmap_927ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.001 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.008190 seconds +[default0]: number of documents: 1114455 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [1058732, 1114455) total of 55723 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.007675 seconds +[default0]: number of documents: 1114455 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.004144 seconds +[default0]: number of documents: 1114455 +[default0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[default0]: > elasped time to build and save shuffle-idx and sample-idx mapping (seconds): 0.136562 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_sw_validation_indexmap_780ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_sw_validation_indexmap_780ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.001 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003160 seconds +[default0]: number of documents: 347499 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [330124, 347499) total of 17375 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003177 seconds +[default0]: number of documents: 347499 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001327 seconds +[default0]: number of documents: 347499 +[default0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[default0]: > elasped time to build and save shuffle-idx and sample-idx mapping (seconds): 0.047133 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_gu_validation_indexmap_780ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_gu_validation_indexmap_780ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.001 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003586 seconds +[default0]: number of documents: 339210 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [322250, 339210) total of 16960 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003148 seconds +[default0]: number of documents: 339210 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001225 seconds +[default0]: number of documents: 339210 +[default0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[default0]: > elasped time to build and save shuffle-idx and sample-idx mapping (seconds): 0.045723 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_pa_validation_indexmap_728ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_pa_validation_indexmap_728ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.001 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003475 seconds +[default0]: number of documents: 315754 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [299966, 315754) total of 15788 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003205 seconds +[default0]: number of documents: 315754 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001177 seconds +[default0]: number of documents: 315754 +[default0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[default0]: > elasped time to build and save shuffle-idx and sample-idx mapping (seconds): 0.041086 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ne_validation_indexmap_548ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ne_validation_indexmap_548ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.001 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.007276 seconds +[default0]: number of documents: 918416 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [872495, 918416) total of 45921 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.008182 seconds +[default0]: number of documents: 918416 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003737 seconds +[default0]: number of documents: 918416 +[default0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[default0]: > elasped time to build and save shuffle-idx and sample-idx mapping (seconds): 0.112706 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_yo_validation_indexmap_491ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_yo_validation_indexmap_491ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.001 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.008185 seconds +[default0]: number of documents: 950097 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [902592, 950097) total of 47505 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.008938 seconds +[default0]: number of documents: 950097 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003561 seconds +[default0]: number of documents: 950097 +[default0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[default0]: > elasped time to build and save shuffle-idx and sample-idx mapping (seconds): 0.114292 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ig_validation_indexmap_453ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ig_validation_indexmap_453ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.001 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.008430 seconds +[default0]: number of documents: 915063 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [869310, 915063) total of 45753 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.008750 seconds +[default0]: number of documents: 915063 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003707 seconds +[default0]: number of documents: 915063 +[default0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[default0]: > elasped time to build and save shuffle-idx and sample-idx mapping (seconds): 0.266968 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ny_validation_indexmap_379ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ny_validation_indexmap_379ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.001 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.008215 seconds +[default0]: number of documents: 915061 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [869308, 915061) total of 45753 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.008245 seconds +[default0]: number of documents: 915061 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003685 seconds +[default0]: number of documents: 915061 +[default0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[default0]: > elasped time to build and save shuffle-idx and sample-idx mapping (seconds): 0.109154 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_zu_validation_indexmap_367ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_zu_validation_indexmap_367ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.001 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.008305 seconds +[default0]: number of documents: 915058 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [869305, 915058) total of 45753 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.008326 seconds +[default0]: number of documents: 915058 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003583 seconds +[default0]: number of documents: 915058 +[default0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[default0]: > elasped time to build and save shuffle-idx and sample-idx mapping (seconds): 0.107368 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_xh_validation_indexmap_355ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_xh_validation_indexmap_355ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.001 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.008636 seconds +[default0]: number of documents: 865056 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [821803, 865056) total of 43253 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.008495 seconds +[default0]: number of documents: 865056 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003627 seconds +[default0]: number of documents: 865056 +[default0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[default0]: > elasped time to build and save shuffle-idx and sample-idx mapping (seconds): 0.103500 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_sn_validation_indexmap_351ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_sn_validation_indexmap_351ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.001 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.007642 seconds +[default0]: number of documents: 915044 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [869292, 915044) total of 45752 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.007979 seconds +[default0]: number of documents: 915044 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.004370 seconds +[default0]: number of documents: 915044 +[default0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[default0]: > elasped time to build and save shuffle-idx and sample-idx mapping (seconds): 0.108561 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ts_validation_indexmap_348ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ts_validation_indexmap_348ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.001 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.007986 seconds +[default0]: number of documents: 915043 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [869291, 915043) total of 45752 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.008999 seconds +[default0]: number of documents: 915043 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.004950 seconds +[default0]: number of documents: 915043 +[default0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[default0]: > elasped time to build and save shuffle-idx and sample-idx mapping (seconds): 0.107925 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_rw_validation_indexmap_347ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_rw_validation_indexmap_347ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.001 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.008512 seconds +[default0]: number of documents: 915021 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [869270, 915021) total of 45751 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.008770 seconds +[default0]: number of documents: 915021 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.005177 seconds +[default0]: number of documents: 915021 +[default0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[default0]: > elasped time to build and save shuffle-idx and sample-idx mapping (seconds): 0.108130 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_lg_validation_indexmap_328ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_lg_validation_indexmap_328ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.001 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.008064 seconds +[default0]: number of documents: 915054 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [869301, 915054) total of 45753 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.008075 seconds +[default0]: number of documents: 915054 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003588 seconds +[default0]: number of documents: 915054 +[default0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[default0]: > elasped time to build and save shuffle-idx and sample-idx mapping (seconds): 0.108005 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_tn_validation_indexmap_328ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_tn_validation_indexmap_328ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.001 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.008606 seconds +[default0]: number of documents: 915051 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [869298, 915051) total of 45753 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.006143 seconds +[default0]: number of documents: 915051 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003869 seconds +[default0]: number of documents: 915051 +[default0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[default0]: > elasped time to build and save shuffle-idx and sample-idx mapping (seconds): 0.108586 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_nso_validation_indexmap_320ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_nso_validation_indexmap_320ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.001 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.004221 seconds +[default0]: number of documents: 318189 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [302280, 318189) total of 15909 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001815 seconds +[default0]: number of documents: 318189 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001324 seconds +[default0]: number of documents: 318189 +[default0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[default0]: > elasped time to build and save shuffle-idx and sample-idx mapping (seconds): 0.041626 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_rn_validation_indexmap_269ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_rn_validation_indexmap_269ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.001 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.002562 seconds +[default0]: number of documents: 265864 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [252571, 265864) total of 13293 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.002659 seconds +[default0]: number of documents: 265864 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.000873 seconds +[default0]: number of documents: 265864 +[default0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[default0]: > elasped time to build and save shuffle-idx and sample-idx mapping (seconds): 0.034460 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ml_validation_indexmap_244ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ml_validation_indexmap_244ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.001 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003867 seconds +[default0]: number of documents: 265063 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [251810, 265063) total of 13253 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.002694 seconds +[default0]: number of documents: 265063 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.000846 seconds +[default0]: number of documents: 265063 +[default0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[default0]: > elasped time to build and save shuffle-idx and sample-idx mapping (seconds): 0.033873 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_kn_validation_indexmap_231ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_kn_validation_indexmap_231ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.001 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003299 seconds +[default0]: number of documents: 265063 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [251810, 265063) total of 13253 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.002295 seconds +[default0]: number of documents: 265063 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.000816 seconds +[default0]: number of documents: 265063 +[default0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[default0]: > elasped time to build and save shuffle-idx and sample-idx mapping (seconds): 0.034334 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_or_validation_indexmap_225ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_or_validation_indexmap_225ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.001 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.002800 seconds +[default0]: number of documents: 265063 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [251810, 265063) total of 13253 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.002332 seconds +[default0]: number of documents: 265063 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.000785 seconds +[default0]: number of documents: 265063 +[default0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[default0]: > elasped time to build and save shuffle-idx and sample-idx mapping (seconds): 0.033718 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_as_validation_indexmap_218ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_as_validation_indexmap_218ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.001 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003379 seconds +[default0]: number of documents: 365060 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [346807, 365060) total of 18253 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003211 seconds +[default0]: number of documents: 365060 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.000953 seconds +[default0]: number of documents: 365060 +[default0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[default0]: > elasped time to build and save shuffle-idx and sample-idx mapping (seconds): 0.046594 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ln_validation_indexmap_176ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ln_validation_indexmap_176ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.002 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.048804 seconds +[default0]: number of documents: 365063 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [346810, 365063) total of 18253 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.027232 seconds +[default0]: number of documents: 365063 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001300 seconds +[default0]: number of documents: 365063 +[default0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[default0]: > elasped time to build and save shuffle-idx and sample-idx mapping (seconds): 0.045367 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_wo_validation_indexmap_171ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_wo_validation_indexmap_171ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.001 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.002759 seconds +[default0]: number of documents: 265063 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [251810, 265063) total of 13253 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.002638 seconds +[default0]: number of documents: 265063 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001004 seconds +[default0]: number of documents: 265063 +[default0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[default0]: > elasped time to build and save shuffle-idx and sample-idx mapping (seconds): 0.035362 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_tum_validation_indexmap_162ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_tum_validation_indexmap_162ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.001 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.002617 seconds +[default0]: number of documents: 265180 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [251921, 265180) total of 13259 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.002347 seconds +[default0]: number of documents: 265180 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.000847 seconds +[default0]: number of documents: 265180 +[default0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[default0]: > elasped time to build and save shuffle-idx and sample-idx mapping (seconds): 0.176208 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ki_validation_indexmap_160ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ki_validation_indexmap_160ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.001 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.002754 seconds +[default0]: number of documents: 265063 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [251810, 265063) total of 13253 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.002421 seconds +[default0]: number of documents: 265063 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.000763 seconds +[default0]: number of documents: 265063 +[default0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[default0]: > elasped time to build and save shuffle-idx and sample-idx mapping (seconds): 0.034511 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_st_validation_indexmap_159ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_st_validation_indexmap_159ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.001 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.002960 seconds +[default0]: number of documents: 265063 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [251810, 265063) total of 13253 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.002574 seconds +[default0]: number of documents: 265063 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.000857 seconds +[default0]: number of documents: 265063 +[default0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[default0]: > elasped time to build and save shuffle-idx and sample-idx mapping (seconds): 0.034330 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_fon_validation_indexmap_158ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_fon_validation_indexmap_158ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.001 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.002886 seconds +[default0]: number of documents: 271191 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [257631, 271191) total of 13560 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.002742 seconds +[default0]: number of documents: 271191 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.000862 seconds +[default0]: number of documents: 271191 +[default0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[default0]: > elasped time to build and save shuffle-idx and sample-idx mapping (seconds): 0.035129 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ca_validation_indexmap_154ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ca_validation_indexmap_154ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.001 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.002953 seconds +[default0]: number of documents: 269973 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [256474, 269973) total of 13499 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.002002 seconds +[default0]: number of documents: 269973 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.000910 seconds +[default0]: number of documents: 269973 +[default0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[default0]: > elasped time to build and save shuffle-idx and sample-idx mapping (seconds): 0.034893 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_eu_validation_indexmap_151ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_eu_validation_indexmap_151ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.001 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.002702 seconds +[default0]: number of documents: 265071 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [251817, 265071) total of 13254 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.002353 seconds +[default0]: number of documents: 265071 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.000854 seconds +[default0]: number of documents: 265071 +[default0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[default0]: > elasped time to build and save shuffle-idx and sample-idx mapping (seconds): 0.034686 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ak_validation_indexmap_151ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ak_validation_indexmap_151ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.001 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.002763 seconds +[default0]: number of documents: 265180 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [251921, 265180) total of 13259 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.002412 seconds +[default0]: number of documents: 265180 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.000856 seconds +[default0]: number of documents: 265180 +[default0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[default0]: > elasped time to build and save shuffle-idx and sample-idx mapping (seconds): 0.034038 +[default4]:GOTCONSUMEDSAMPLES 0 0 +[default6]:GOTCONSUMEDSAMPLES 0 0 +[default0]:GOTCONSUMEDSAMPLES 0 0 +[default4]:GOTCONSUMEDSAMPLES 0 0 +[default2]:GOTCONSUMEDSAMPLES 0 0 +[default6]:GOTCONSUMEDSAMPLES 0 0 +[default2]:GOTCONSUMEDSAMPLES 0 0 +[default6]:GOTCONSUMEDSAMPLES 0 0 +[default2]:GOTCONSUMEDSAMPLES 0 0 +[default6]:GOTCONSUMEDSAMPLES 0 0 +[default4]:GOTCONSUMEDSAMPLES 0 0 +[default0]:GOTCONSUMEDSAMPLES 0 0 +[default2]:GOTCONSUMEDSAMPLES 0 0 +[default4]:GOTCONSUMEDSAMPLES 0 0 +[default2]:GOTCONSUMEDSAMPLES 0 0 +[default4]:GOTCONSUMEDSAMPLES 0 0 +[default0]:GOTCONSUMEDSAMPLES 0 0 +[default4]:GOTCONSUMEDSAMPLES 0 0 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_bm_validation_indexmap_149ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_bm_validation_indexmap_149ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.001 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.002513 seconds +[default0]: number of documents: 265071 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [251817, 265071) total of 13254 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.002508 seconds +[default0]: number of documents: 265071 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.000953 seconds +[default0]: number of documents: 265071 +[default0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[default0]: > elasped time to build and save shuffle-idx and sample-idx mapping (seconds): 0.034551 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_tw_validation_indexmap_148ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_tw_validation_indexmap_148ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.001 seconds +[default0]:> building indices for blendable datasets ... +[default0]: > sample ratios: +[default0]: dataset 0, input: 0.387164, achieved: 0.387164 +[default0]: dataset 1, input: 0.0786754, achieved: 0.0786748 +[default0]: dataset 2, input: 0.0636898, achieved: 0.06369 +[default0]: dataset 3, input: 0.0584986, achieved: 0.0584988 +[default0]: dataset 4, input: 0.0576332, achieved: 0.0576333 +[default0]: dataset 5, input: 0.0485994, achieved: 0.0485993 +[default0]: dataset 6, input: 0.0478619, achieved: 0.0478607 +[default0]: dataset 7, input: 0.0476917, achieved: 0.0476915 +[default0]: dataset 8, input: 0.045653, achieved: 0.0456527 +[default0]: dataset 9, input: 0.0322254, achieved: 0.0322258 +[default0]: dataset 10, input: 0.0199319, achieved: 0.0199317 +[default0]: dataset 11, input: 0.0138497, achieved: 0.0138501 +[default0]: dataset 12, input: 0.00960604, achieved: 0.00960523 +[default0]: dataset 13, input: 0.00865243, achieved: 0.00865317 +[default0]: dataset 14, input: 0.00692261, achieved: 0.00692215 +[default0]: dataset 15, input: 0.00582803, achieved: 0.00582776 +[default0]: dataset 16, input: 0.00582586, achieved: 0.00582584 +[default0]: dataset 17, input: 0.00543684, achieved: 0.00543732 +[default0]: dataset 18, input: 0.00409056, achieved: 0.00409097 +[default0]: dataset 19, input: 0.00366562, achieved: 0.00366591 +[default0]: dataset 20, input: 0.00337934, achieved: 0.00337933 +[default0]: dataset 21, input: 0.00282756, achieved: 0.00282733 +[default0]: dataset 22, input: 0.00274012, achieved: 0.00274078 +[default0]: dataset 23, input: 0.00264619, achieved: 0.00264654 +[default0]: dataset 24, input: 0.00262358, achieved: 0.00262346 +[default0]: dataset 25, input: 0.0026003, achieved: 0.00260038 +[default0]: dataset 26, input: 0.00259099, achieved: 0.00259076 +[default0]: dataset 27, input: 0.00245151, achieved: 0.00245228 +[default0]: dataset 28, input: 0.00244735, achieved: 0.00244651 +[default0]: dataset 29, input: 0.00238684, achieved: 0.00238688 +[default0]: dataset 30, input: 0.0020053, achieved: 0.00200606 +[default0]: dataset 31, input: 0.00181876, achieved: 0.00181949 +[default0]: dataset 32, input: 0.00171918, achieved: 0.00171948 +[default0]: dataset 33, input: 0.00167779, achieved: 0.00167716 +[default0]: dataset 34, input: 0.00162359, achieved: 0.00162331 +[default0]: dataset 35, input: 0.00131237, achieved: 0.00131173 +[default0]: dataset 36, input: 0.00127347, achieved: 0.00127326 +[default0]: dataset 37, input: 0.00120564, achieved: 0.00120594 +[default0]: dataset 38, input: 0.00119533, achieved: 0.00119633 +[default0]: dataset 39, input: 0.00118536, achieved: 0.00118479 +[default0]: dataset 40, input: 0.00117488, achieved: 0.00117517 +[default0]: dataset 41, input: 0.00114928, achieved: 0.00115017 +[default0]: dataset 42, input: 0.00112334, achieved: 0.00112324 +[default0]: dataset 43, input: 0.00112318, achieved: 0.00112324 +[default0]: dataset 44, input: 0.00111237, achieved: 0.0011117 +[default0]: dataset 45, input: 0.00110439, achieved: 0.00110401 +[default0]:> elapsed time for building blendable dataset indices: 0.03 (sec) +[default0]:> finished creating T0 datasets ... +[default0]:GOTCONSUMEDSAMPLES 0 0 +[default6]:GOTCONSUMEDSAMPLES 0 0 +[default6]:GOTCONSUMEDSAMPLES 0 0 +[default4]:GOTCONSUMEDSAMPLES 0 0 +[default0]:GOTCONSUMEDSAMPLES 0 0 +[default2]:GOTCONSUMEDSAMPLES 0 0 +[default4]:GOTCONSUMEDSAMPLES 0 0 +[default2]:GOTCONSUMEDSAMPLES 0 0 +[default0]:GOTCONSUMEDSAMPLES 0 0 +[default6]:GOTCONSUMEDSAMPLES 0 0 +[default0]:GOTCONSUMEDSAMPLES 0 0 +[default0]:GOTCONSUMEDSAMPLES 0 0 +[default2]:GOTCONSUMEDSAMPLES 0 0 +[default6]:GOTCONSUMEDSAMPLES 0 0 +[default1]:[001-001] 2.2365B / 1.2089B +[default0]:[000-001] 2.2365B / 1.2089B +[default7]:time (ms) | model-and-optimizer-setup: 16087.25 | train/valid/test-data-iterators-setup: 30263.13 +[default5]:Traceback (most recent call last): +[default5]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default5]: main() +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default5]: return f(*args, **kwargs) +[default5]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default5]: pretrain( +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 188, in pretrain +[default5]: iteration = train(forward_step_func, +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 930, in train +[default5]: train_step(forward_step_func, +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 510, in train_step +[default5]: loss = model[0].train_batch(data_iter=data_iterator) +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 345, in train_batch +[default5]: self._exec_schedule(sched) +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1375, in _exec_schedule +[default5]: self._exec_instr(**cmd.kwargs) +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 789, in _exec_load_micro_batch +[default5]: batch = self._next_batch() +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 625, in _next_batch +[default5]: batch = self.batch_fn(batch) +[default5]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 70, in get_batch_pipe +[default5]: if 'text' in data: +[default5]:TypeError: argument of type 'NoneType' is not iterable +[default1]:Traceback (most recent call last): +[default1]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default1]: main() +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default1]: return f(*args, **kwargs) +[default1]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default1]: pretrain( +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 188, in pretrain +[default1]: iteration = train(forward_step_func, +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 930, in train +[default1]: train_step(forward_step_func, +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 510, in train_step +[default1]: loss = model[0].train_batch(data_iter=data_iterator) +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 345, in train_batch +[default1]: self._exec_schedule(sched) +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1375, in _exec_schedule +[default1]: self._exec_instr(**cmd.kwargs) +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 789, in _exec_load_micro_batch +[default1]: batch = self._next_batch() +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 625, in _next_batch +[default1]: batch = self.batch_fn(batch) +[default1]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 70, in get_batch_pipe +[default1]: if 'text' in data: +[default1]:TypeError: argument of type 'NoneType' is not iterable +[default5]:Traceback (most recent call last): +[default5]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default5]: main() +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default5]: return f(*args, **kwargs) +[default5]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default5]: pretrain( +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 188, in pretrain +[default5]: iteration = train(forward_step_func, +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 930, in train +[default5]: train_step(forward_step_func, +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 510, in train_step +[default5]: loss = model[0].train_batch(data_iter=data_iterator) +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 345, in train_batch +[default5]: self._exec_schedule(sched) +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1375, in _exec_schedule +[default5]: self._exec_instr(**cmd.kwargs) +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 789, in _exec_load_micro_batch +[default5]: batch = self._next_batch() +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 625, in _next_batch +[default5]: batch = self.batch_fn(batch) +[default5]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 70, in get_batch_pipe +[default5]: if 'text' in data: +[default5]:TypeError: argument of type 'NoneType' is not iterable +[default0]:[after dataloaders are built] datetime: 2022-10-07 09:02:26 +[default0]:done with setup ... +[default0]:training ... +[default0]:Number of parameters: [tensor rank - pipeline rank] w/ and w/o embeddings: +[default0]:[000-000] 2.2365B / 1.2089B +[default0]:[before the start of training step] datetime: 2022-10-07 09:02:26 +[default7]:Traceback (most recent call last): +[default7]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default7]: main() +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default7]: return f(*args, **kwargs) +[default7]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default7]: pretrain( +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 188, in pretrain +[default7]: iteration = train(forward_step_func, +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 930, in train +[default7]: train_step(forward_step_func, +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 510, in train_step +[default7]: loss = model[0].train_batch(data_iter=data_iterator) +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 345, in train_batch +[default7]: self._exec_schedule(sched) +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1375, in _exec_schedule +[default7]: self._exec_instr(**cmd.kwargs) +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 789, in _exec_load_micro_batch +[default7]: batch = self._next_batch() +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 625, in _next_batch +[default7]: batch = self.batch_fn(batch) +[default7]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 70, in get_batch_pipe +[default7]: if 'text' in data: +[default7]:TypeError: argument of type 'NoneType' is not iterable +[default7]:Traceback (most recent call last): +[default7]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default7]: main() +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default7]: return f(*args, **kwargs) +[default7]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default7]: pretrain( +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 188, in pretrain +[default7]: iteration = train(forward_step_func, +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 930, in train +[default7]: train_step(forward_step_func, +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 510, in train_step +[default7]: loss = model[0].train_batch(data_iter=data_iterator) +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 345, in train_batch +[default7]: self._exec_schedule(sched) +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1375, in _exec_schedule +[default7]: self._exec_instr(**cmd.kwargs) +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 789, in _exec_load_micro_batch +[default7]: batch = self._next_batch() +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 625, in _next_batch +[default7]: batch = self.batch_fn(batch) +[default7]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 70, in get_batch_pipe +[default7]: if 'text' in data: +[default7]:TypeError: argument of type 'NoneType' is not iterable +[default1]:[001-000] 2.2365B / 1.2089B +[default1]:Traceback (most recent call last): +[default1]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default1]: main() +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default1]: return f(*args, **kwargs) +[default1]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default1]: pretrain( +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 188, in pretrain +[default1]: iteration = train(forward_step_func, +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 930, in train +[default1]: train_step(forward_step_func, +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 510, in train_step +[default1]: loss = model[0].train_batch(data_iter=data_iterator) +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 345, in train_batch +[default1]: self._exec_schedule(sched) +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1375, in _exec_schedule +[default1]: self._exec_instr(**cmd.kwargs) +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 789, in _exec_load_micro_batch +[default1]: batch = self._next_batch() +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 625, in _next_batch +[default3]:Traceback (most recent call last): +[default3]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default1]: batch = self.batch_fn(batch) +[default3]: main() +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default3]: return f(*args, **kwargs) +[default3]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default3]: pretrain( +[default1]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 70, in get_batch_pipe +[default1]: if 'text' in data: +[default1]:TypeError: argument of type 'NoneType' is not iterable +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 188, in pretrain +[default3]: iteration = train(forward_step_func, +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 930, in train +[default3]: train_step(forward_step_func, +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 510, in train_step +[default3]: loss = model[0].train_batch(data_iter=data_iterator) +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 345, in train_batch +[default3]: self._exec_schedule(sched) +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1375, in _exec_schedule +[default3]: self._exec_instr(**cmd.kwargs) +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 789, in _exec_load_micro_batch +[default3]: batch = self._next_batch() +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 625, in _next_batch +[default3]: batch = self.batch_fn(batch) +[default3]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 70, in get_batch_pipe +[default3]: if 'text' in data: +[default3]:TypeError: argument of type 'NoneType' is not iterable +[default1]:Traceback (most recent call last): +[default1]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default1]: main() +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default1]: return f(*args, **kwargs) +[default1]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default1]: pretrain( +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 188, in pretrain +[default1]: iteration = train(forward_step_func, +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 930, in train +[default1]: train_step(forward_step_func, +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 510, in train_step +[default1]: loss = model[0].train_batch(data_iter=data_iterator) +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 345, in train_batch +[default1]: self._exec_schedule(sched) +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1375, in _exec_schedule +[default1]: self._exec_instr(**cmd.kwargs) +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 789, in _exec_load_micro_batch +[default1]: batch = self._next_batch() +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 625, in _next_batch +[default1]: batch = self.batch_fn(batch) +[default1]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 70, in get_batch_pipe +[default1]: if 'text' in data: +[default1]:TypeError: argument of type 'NoneType' is not iterable +[default1]:Traceback (most recent call last): +[default1]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default1]: main() +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default1]: return f(*args, **kwargs) +[default1]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default1]: pretrain( +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 188, in pretrain +[default1]: iteration = train(forward_step_func, +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 930, in train +[default1]: train_step(forward_step_func, +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 510, in train_step +[default1]: loss = model[0].train_batch(data_iter=data_iterator) +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 345, in train_batch +[default1]: self._exec_schedule(sched) +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1375, in _exec_schedule +[default1]: self._exec_instr(**cmd.kwargs) +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 789, in _exec_load_micro_batch +[default1]: batch = self._next_batch() +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 625, in _next_batch +[default1]: batch = self.batch_fn(batch) +[default1]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 70, in get_batch_pipe +[default1]: if 'text' in data: +[default1]:TypeError: argument of type 'NoneType' is not iterable +[default3]:Traceback (most recent call last): +[default3]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default3]: main() +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default3]: return f(*args, **kwargs) +[default3]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default3]: pretrain( +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 188, in pretrain +[default3]: iteration = train(forward_step_func, +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 930, in train +[default3]: train_step(forward_step_func, +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 510, in train_step +[default3]: loss = model[0].train_batch(data_iter=data_iterator) +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 345, in train_batch +[default3]: self._exec_schedule(sched) +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1375, in _exec_schedule +[default3]: self._exec_instr(**cmd.kwargs) +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 789, in _exec_load_micro_batch +[default3]: batch = self._next_batch() +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 625, in _next_batch +[default3]: batch = self.batch_fn(batch) +[default3]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 70, in get_batch_pipe +[default3]: if 'text' in data: +[default3]:TypeError: argument of type 'NoneType' is not iterable +[default3]:Traceback (most recent call last): +[default3]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default3]: main() +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default3]: return f(*args, **kwargs) +[default3]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default3]: pretrain( +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 188, in pretrain +[default3]: iteration = train(forward_step_func, +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 930, in train +[default3]: train_step(forward_step_func, +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 510, in train_step +[default3]: loss = model[0].train_batch(data_iter=data_iterator) +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 345, in train_batch +[default3]: self._exec_schedule(sched) +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1375, in _exec_schedule +[default3]: self._exec_instr(**cmd.kwargs) +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 789, in _exec_load_micro_batch +[default3]: batch = self._next_batch() +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 625, in _next_batch +[default3]: batch = self.batch_fn(batch) +[default3]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 70, in get_batch_pipe +[default3]: if 'text' in data: +[default3]:TypeError: argument of type 'NoneType' is not iterable +[default5]:Traceback (most recent call last): +[default5]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default5]: main() +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default5]: return f(*args, **kwargs) +[default5]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default5]: pretrain( +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 188, in pretrain +[default5]: iteration = train(forward_step_func, +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 930, in train +[default5]: train_step(forward_step_func, +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 510, in train_step +[default5]: loss = model[0].train_batch(data_iter=data_iterator) +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 345, in train_batch +[default5]: self._exec_schedule(sched) +[default7]:Traceback (most recent call last): +[default7]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1375, in _exec_schedule +[default5]: self._exec_instr(**cmd.kwargs) +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 789, in _exec_load_micro_batch +[default7]: main() +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default7]: return f(*args, **kwargs) +[default7]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default7]: pretrain( +[default5]: batch = self._next_batch() +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 188, in pretrain +[default7]: iteration = train(forward_step_func, +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 625, in _next_batch +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 930, in train +[default7]: train_step(forward_step_func, +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 510, in train_step +[default7]: loss = model[0].train_batch(data_iter=data_iterator) +[default5]: batch = self.batch_fn(batch) +[default5]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 70, in get_batch_pipe +[default5]: if 'text' in data: +[default5]:TypeError: argument of type 'NoneType' is not iterable +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 345, in train_batch +[default7]: self._exec_schedule(sched) +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1375, in _exec_schedule +[default7]: self._exec_instr(**cmd.kwargs) +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 789, in _exec_load_micro_batch +[default7]: batch = self._next_batch() +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 625, in _next_batch +[default7]: batch = self.batch_fn(batch) +[default7]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 70, in get_batch_pipe +[default7]: if 'text' in data: +[default7]:TypeError: argument of type 'NoneType' is not iterable +[default3]:Traceback (most recent call last): +[default3]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default3]: main() +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default3]: return f(*args, **kwargs) +[default3]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default3]: pretrain( +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 188, in pretrain +[default3]: iteration = train(forward_step_func, +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 930, in train +[default3]: train_step(forward_step_func, +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 510, in train_step +[default3]: loss = model[0].train_batch(data_iter=data_iterator) +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 345, in train_batch +[default3]: self._exec_schedule(sched) +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1375, in _exec_schedule +[default3]: self._exec_instr(**cmd.kwargs) +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 789, in _exec_load_micro_batch +[default3]: batch = self._next_batch() +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 625, in _next_batch +[default3]: batch = self.batch_fn(batch) +[default3]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 70, in get_batch_pipe +[default3]: if 'text' in data: +[default3]:TypeError: argument of type 'NoneType' is not iterable +[default5]:Traceback (most recent call last): +[default5]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default5]: main() +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default5]: return f(*args, **kwargs) +[default5]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default5]: pretrain( +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 188, in pretrain +[default5]: iteration = train(forward_step_func, +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 930, in train +[default5]: train_step(forward_step_func, +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 510, in train_step +[default5]: loss = model[0].train_batch(data_iter=data_iterator) +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 345, in train_batch +[default5]: self._exec_schedule(sched) +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1375, in _exec_schedule +[default5]: self._exec_instr(**cmd.kwargs) +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 789, in _exec_load_micro_batch +[default5]: batch = self._next_batch() +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 625, in _next_batch +[default5]: batch = self.batch_fn(batch) +[default5]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 70, in get_batch_pipe +[default5]: if 'text' in data: +[default5]:TypeError: argument of type 'NoneType' is not iterable +[default7]:Traceback (most recent call last): +[default7]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default7]: main() +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default7]: return f(*args, **kwargs) +[default7]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default7]: pretrain( +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 188, in pretrain +[default7]: iteration = train(forward_step_func, +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 930, in train +[default7]: train_step(forward_step_func, +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 510, in train_step +[default7]: loss = model[0].train_batch(data_iter=data_iterator) +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 345, in train_batch +[default7]: self._exec_schedule(sched) +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1375, in _exec_schedule +[default7]: self._exec_instr(**cmd.kwargs) +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 789, in _exec_load_micro_batch +[default7]: batch = self._next_batch() +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 625, in _next_batch +[default7]: batch = self.batch_fn(batch) +[default7]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 70, in get_batch_pipe +[default7]: if 'text' in data: +[default7]:TypeError: argument of type 'NoneType' is not iterable +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 347147 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 347149 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 347151 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 347153 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 365737 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 365739 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 365741 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 347100 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 347102 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 365743 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 347104 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 347106 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 375644 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 375646 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 375648 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 375650 closing signal SIGTERM +ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 1 (pid: 347148) of binary: /gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/bin/python +ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 1 (pid: 347101) of binary: /gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/bin/python +ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 1 (pid: 365738) of binary: /gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/bin/python +ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 1 (pid: 375645) of binary: /gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/bin/python +ERROR:torch.distributed.elastic.agent.server.api:Error waiting on exit barrier. Elapsed: 300.48546600341797 seconds +Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/agent/server/api.py", line 906, in _exit_barrier + store_util.barrier( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/utils/store.py", line 78, in barrier + synchronize(store, data, rank, world_size, key_prefix, barrier_timeout) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/utils/store.py", line 64, in synchronize + agent_data = get_all(store, rank, key_prefix, world_size) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/utils/store.py", line 34, in get_all + data = store.get(f"{prefix}{idx}") +RuntimeError: Socket Timeout +Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 194, in _run_module_as_main + return _run_code(code, main_globals, None, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 87, in _run_code + exec(code, run_globals) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 765, in + main() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 761, in main + run(args) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 752, in run + elastic_launch( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 131, in __call__ + return launch_agent(self._config, self._entrypoint, list(args)) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 245, in launch_agent + raise ChildFailedError( +torch.distributed.elastic.multiprocessing.errors.ChildFailedError: +============================================================ +/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py FAILED +------------------------------------------------------------ +Failures: +[1]: + time : 2022-10-07_09:02:26 + host : jean-zay-iam41-ib0 + rank : 27 (local_rank: 3) + exitcode : 1 (pid: 365740) + error_file: /tmp/torchelastic_abjm4rlg/none_llm9bmq2/attempt_0/3/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 188, in pretrain + iteration = train(forward_step_func, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 930, in train + train_step(forward_step_func, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 510, in train_step + loss = model[0].train_batch(data_iter=data_iterator) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 345, in train_batch + self._exec_schedule(sched) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1375, in _exec_schedule + self._exec_instr(**cmd.kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 789, in _exec_load_micro_batch + batch = self._next_batch() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 625, in _next_batch + batch = self.batch_fn(batch) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 70, in get_batch_pipe + if 'text' in data: + TypeError: argument of type 'NoneType' is not iterable + +[2]: + time : 2022-10-07_09:02:26 + host : jean-zay-iam41-ib0 + rank : 29 (local_rank: 5) + exitcode : 1 (pid: 365742) + error_file: /tmp/torchelastic_abjm4rlg/none_llm9bmq2/attempt_0/5/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 188, in pretrain + iteration = train(forward_step_func, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 930, in train + train_step(forward_step_func, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 510, in train_step + loss = model[0].train_batch(data_iter=data_iterator) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 345, in train_batch + self._exec_schedule(sched) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1375, in _exec_schedule + self._exec_instr(**cmd.kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 789, in _exec_load_micro_batch + batch = self._next_batch() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 625, in _next_batch + batch = self.batch_fn(batch) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 70, in get_batch_pipe + if 'text' in data: + TypeError: argument of type 'NoneType' is not iterable + +[3]: + time : 2022-10-07_09:02:26 + host : jean-zay-iam41-ib0 + rank : 31 (local_rank: 7) + exitcode : 1 (pid: 365744) + error_file: /tmp/torchelastic_abjm4rlg/none_llm9bmq2/attempt_0/7/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 188, in pretrain + iteration = train(forward_step_func, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 930, in train + train_step(forward_step_func, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 510, in train_step + loss = model[0].train_batch(data_iter=data_iterator) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 345, in train_batch + self._exec_schedule(sched) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1375, in _exec_schedule + self._exec_instr(**cmd.kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 789, in _exec_load_micro_batch + batch = self._next_batch() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 625, in _next_batch + batch = self.batch_fn(batch) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 70, in get_batch_pipe + if 'text' in data: + TypeError: argument of type 'NoneType' is not iterable + +------------------------------------------------------------ +Root Cause (first observed failure): +[0]: + time : 2022-10-07_09:02:26 + host : jean-zay-iam41-ib0 + rank : 25 (local_rank: 1) + exitcode : 1 (pid: 365738) + error_file: /tmp/torchelastic_abjm4rlg/none_llm9bmq2/attempt_0/1/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 188, in pretrain + iteration = train(forward_step_func, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 930, in train + train_step(forward_step_func, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 510, in train_step + loss = model[0].train_batch(data_iter=data_iterator) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 345, in train_batch + self._exec_schedule(sched) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1375, in _exec_schedule + self._exec_instr(**cmd.kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 789, in _exec_load_micro_batch + batch = self._next_batch() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 625, in _next_batch + batch = self.batch_fn(batch) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 70, in get_batch_pipe + if 'text' in data: + TypeError: argument of type 'NoneType' is not iterable + +============================================================ +ERROR:torch.distributed.elastic.agent.server.api:Error waiting on exit barrier. Elapsed: 300.6498258113861 seconds +Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/agent/server/api.py", line 906, in _exit_barrier + store_util.barrier( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/utils/store.py", line 78, in barrier + synchronize(store, data, rank, world_size, key_prefix, barrier_timeout) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/utils/store.py", line 64, in synchronize + agent_data = get_all(store, rank, key_prefix, world_size) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/utils/store.py", line 34, in get_all + data = store.get(f"{prefix}{idx}") +RuntimeError: Socket Timeout +Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 194, in _run_module_as_main + return _run_code(code, main_globals, None, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 87, in _run_code + exec(code, run_globals) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 765, in + main() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 761, in main + run(args) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 752, in run + elastic_launch( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 131, in __call__ + return launch_agent(self._config, self._entrypoint, list(args)) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 245, in launch_agent + raise ChildFailedError( +torch.distributed.elastic.multiprocessing.errors.ChildFailedError: +============================================================ +/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py FAILED +------------------------------------------------------------ +Failures: +[1]: + time : 2022-10-07_09:02:26 + host : jean-zay-iam40-ib0 + rank : 19 (local_rank: 3) + exitcode : 1 (pid: 347103) + error_file: /tmp/torchelastic_nv4ko73p/none_l3ysb_bh/attempt_0/3/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 188, in pretrain + iteration = train(forward_step_func, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 930, in train + train_step(forward_step_func, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 510, in train_step + loss = model[0].train_batch(data_iter=data_iterator) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 345, in train_batch + self._exec_schedule(sched) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1375, in _exec_schedule + self._exec_instr(**cmd.kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 789, in _exec_load_micro_batch + batch = self._next_batch() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 625, in _next_batch + batch = self.batch_fn(batch) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 70, in get_batch_pipe + if 'text' in data: + TypeError: argument of type 'NoneType' is not iterable + +[2]: + time : 2022-10-07_09:02:26 + host : jean-zay-iam40-ib0 + rank : 21 (local_rank: 5) + exitcode : 1 (pid: 347105) + error_file: /tmp/torchelastic_nv4ko73p/none_l3ysb_bh/attempt_0/5/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 188, in pretrain + iteration = train(forward_step_func, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 930, in train + train_step(forward_step_func, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 510, in train_step + loss = model[0].train_batch(data_iter=data_iterator) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 345, in train_batch + self._exec_schedule(sched) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1375, in _exec_schedule + self._exec_instr(**cmd.kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 789, in _exec_load_micro_batch + batch = self._next_batch() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 625, in _next_batch + batch = self.batch_fn(batch) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 70, in get_batch_pipe + if 'text' in data: + TypeError: argument of type 'NoneType' is not iterable + +[3]: + time : 2022-10-07_09:02:26 + host : jean-zay-iam40-ib0 + rank : 23 (local_rank: 7) + exitcode : 1 (pid: 347107) + error_file: /tmp/torchelastic_nv4ko73p/none_l3ysb_bh/attempt_0/7/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 188, in pretrain + iteration = train(forward_step_func, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 930, in train + train_step(forward_step_func, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 510, in train_step + loss = model[0].train_batch(data_iter=data_iterator) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 345, in train_batch + self._exec_schedule(sched) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1375, in _exec_schedule + self._exec_instr(**cmd.kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 789, in _exec_load_micro_batch + batch = self._next_batch() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 625, in _next_batch + batch = self.batch_fn(batch) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 70, in get_batch_pipe + if 'text' in data: + TypeError: argument of type 'NoneType' is not iterable + +------------------------------------------------------------ +Root Cause (first observed failure): +[0]: + time : 2022-10-07_09:02:26 + host : jean-zay-iam40-ib0 + rank : 17 (local_rank: 1) + exitcode : 1 (pid: 347101) + error_file: /tmp/torchelastic_nv4ko73p/none_l3ysb_bh/attempt_0/1/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 188, in pretrain + iteration = train(forward_step_func, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 930, in train + train_step(forward_step_func, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 510, in train_step + loss = model[0].train_batch(data_iter=data_iterator) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 345, in train_batch + self._exec_schedule(sched) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1375, in _exec_schedule + self._exec_instr(**cmd.kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 789, in _exec_load_micro_batch + batch = self._next_batch() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 625, in _next_batch + batch = self.batch_fn(batch) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 70, in get_batch_pipe + if 'text' in data: + TypeError: argument of type 'NoneType' is not iterable + +============================================================ +ERROR:torch.distributed.elastic.agent.server.api:Error waiting on exit barrier. Elapsed: 301.2758595943451 seconds +Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/agent/server/api.py", line 906, in _exit_barrier + store_util.barrier( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/utils/store.py", line 78, in barrier + synchronize(store, data, rank, world_size, key_prefix, barrier_timeout) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/utils/store.py", line 64, in synchronize + agent_data = get_all(store, rank, key_prefix, world_size) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/utils/store.py", line 34, in get_all + data = store.get(f"{prefix}{idx}") +RuntimeError: Socket Timeout +Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 194, in _run_module_as_main + return _run_code(code, main_globals, None, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 87, in _run_code + exec(code, run_globals) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 765, in + main() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 761, in main + run(args) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 752, in run + elastic_launch( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 131, in __call__ + return launch_agent(self._config, self._entrypoint, list(args)) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 245, in launch_agent + raise ChildFailedError( +torch.distributed.elastic.multiprocessing.errors.ChildFailedError: +============================================================ +/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py FAILED +------------------------------------------------------------ +Failures: +[1]: + time : 2022-10-07_09:02:26 + host : jean-zay-iam39-ib0 + rank : 11 (local_rank: 3) + exitcode : 1 (pid: 347150) + error_file: /tmp/torchelastic_1dia5id5/none_8a89c_a9/attempt_0/3/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 188, in pretrain + iteration = train(forward_step_func, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 930, in train + train_step(forward_step_func, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 510, in train_step + loss = model[0].train_batch(data_iter=data_iterator) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 345, in train_batch + self._exec_schedule(sched) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1375, in _exec_schedule + self._exec_instr(**cmd.kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 789, in _exec_load_micro_batch + batch = self._next_batch() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 625, in _next_batch + batch = self.batch_fn(batch) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 70, in get_batch_pipe + if 'text' in data: + TypeError: argument of type 'NoneType' is not iterable + +[2]: + time : 2022-10-07_09:02:26 + host : jean-zay-iam39-ib0 + rank : 13 (local_rank: 5) + exitcode : 1 (pid: 347152) + error_file: /tmp/torchelastic_1dia5id5/none_8a89c_a9/attempt_0/5/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 188, in pretrain + iteration = train(forward_step_func, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 930, in train + train_step(forward_step_func, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 510, in train_step + loss = model[0].train_batch(data_iter=data_iterator) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 345, in train_batch + self._exec_schedule(sched) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1375, in _exec_schedule + self._exec_instr(**cmd.kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 789, in _exec_load_micro_batch + batch = self._next_batch() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 625, in _next_batch + batch = self.batch_fn(batch) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 70, in get_batch_pipe + if 'text' in data: + TypeError: argument of type 'NoneType' is not iterable + +[3]: + time : 2022-10-07_09:02:26 + host : jean-zay-iam39-ib0 + rank : 15 (local_rank: 7) + exitcode : 1 (pid: 347154) + error_file: /tmp/torchelastic_1dia5id5/none_8a89c_a9/attempt_0/7/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 188, in pretrain + iteration = train(forward_step_func, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 930, in train + train_step(forward_step_func, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 510, in train_step + loss = model[0].train_batch(data_iter=data_iterator) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 345, in train_batch + self._exec_schedule(sched) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1375, in _exec_schedule + self._exec_instr(**cmd.kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 789, in _exec_load_micro_batch + batch = self._next_batch() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 625, in _next_batch + batch = self.batch_fn(batch) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 70, in get_batch_pipe + if 'text' in data: + TypeError: argument of type 'NoneType' is not iterable + +------------------------------------------------------------ +Root Cause (first observed failure): +[0]: + time : 2022-10-07_09:02:26 + host : jean-zay-iam39-ib0 + rank : 9 (local_rank: 1) + exitcode : 1 (pid: 347148) + error_file: /tmp/torchelastic_1dia5id5/none_8a89c_a9/attempt_0/1/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 188, in pretrain + iteration = train(forward_step_func, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 930, in train + train_step(forward_step_func, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 510, in train_step + loss = model[0].train_batch(data_iter=data_iterator) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 345, in train_batch + self._exec_schedule(sched) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1375, in _exec_schedule + self._exec_instr(**cmd.kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 789, in _exec_load_micro_batch + batch = self._next_batch() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 625, in _next_batch + batch = self.batch_fn(batch) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 70, in get_batch_pipe + if 'text' in data: + TypeError: argument of type 'NoneType' is not iterable + +============================================================ +srun: error: jean-zay-iam41: task 3: Exited with exit code 1 +srun: launch/slurm: _step_signal: Terminating StepId=2094507.0 +srun: error: jean-zay-iam40: task 2: Exited with exit code 1 +slurmstepd: error: *** STEP 2094507.0 ON jean-zay-iam38 CANCELLED AT 2022-10-07T09:07:34 *** +WARNING:torch.distributed.elastic.agent.server.api:Received 15 death signal, shutting down workers +WARNING:torch.distributed.elastic.agent.server.api:Received 15 death signal, shutting down workers +WARNING:torch.distributed.elastic.agent.server.api:Received 15 death signal, shutting down workers +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 214983 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 289736 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 201248 closing signal SIGTERM +WARNING:torch.distributed.elastic.agent.server.api:Received 15 death signal, shutting down workers +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 303310 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 214984 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 201249 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 289737 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 303311 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 214985 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 289738 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 201250 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 303312 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 214986 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 289739 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 201251 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 303313 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 214987 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 201252 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 303314 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 214988 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 289740 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 303315 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 289741 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 201253 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 303316 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 214989 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 289742 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 289743 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 201254 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 214990 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 303317 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 201255 closing signal SIGTERM +WARNING:torch.distributed.elastic.agent.server.api:Received 15 death signal, shutting down workers +Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/agent/server/api.py", line 906, in _exit_barrier + store_util.barrier( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/utils/store.py", line 78, in barrier + synchronize(store, data, rank, world_size, key_prefix, barrier_timeout) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/utils/store.py", line 64, in synchronize + agent_data = get_all(store, rank, key_prefix, world_size) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/utils/store.py", line 34, in get_all + data = store.get(f"{prefix}{idx}") +RuntimeError: Socket Timeout + +During handling of the above exception, another exception occurred: + +Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 194, in _run_module_as_main + return _run_code(code, main_globals, None, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 87, in _run_code + exec(code, run_globals) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 765, in + main() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 761, in main + run(args) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 752, in run + elastic_launch( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 131, in __call__ + return launch_agent(self._config, self._entrypoint, list(args)) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 236, in launch_agent + result = agent.run() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/metrics/api.py", line 125, in wrapper + result = f(*args, **kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/agent/server/api.py", line 709, in run + result = self._invoke_run(role) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/agent/server/api.py", line 877, in _invoke_run + self._exit_barrier() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/agent/server/api.py", line 906, in _exit_barrier + store_util.barrier( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/api.py", line 60, in _terminate_process_handler + raise SignalException(f"Process {os.getpid()} got signal: {sigval}", sigval=sigval) +torch.distributed.elastic.multiprocessing.api.SignalException: Process 375605 got signal: 15 +srun: error: jean-zay-iam39: task 1: Exited with exit code 1 +srun: error: jean-zay-iam38: task 0: Exited with exit code 1 +Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 194, in _run_module_as_main + return _run_code(code, main_globals, None, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 87, in _run_code + exec(code, run_globals) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 765, in + main() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 761, in main + run(args) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 752, in run + elastic_launch( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 131, in __call__ + return launch_agent(self._config, self._entrypoint, list(args)) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 236, in launch_agent + result = agent.run() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/metrics/api.py", line 125, in wrapper + result = f(*args, **kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/agent/server/api.py", line 709, in run + result = self._invoke_run(role) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/agent/server/api.py", line 850, in _invoke_run + time.sleep(monitor_interval) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/api.py", line 60, in _terminate_process_handler + raise SignalException(f"Process {os.getpid()} got signal: {sigval}", sigval=sigval) +torch.distributed.elastic.multiprocessing.api.SignalException: Process 214942 got signal: 15 +Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 194, in _run_module_as_main + return _run_code(code, main_globals, None, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 87, in _run_code + exec(code, run_globals) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 765, in + main() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 761, in main + run(args) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 752, in run + elastic_launch( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 131, in __call__ + return launch_agent(self._config, self._entrypoint, list(args)) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 236, in launch_agent + result = agent.run() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/metrics/api.py", line 125, in wrapper + result = f(*args, **kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/agent/server/api.py", line 709, in run + result = self._invoke_run(role) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/agent/server/api.py", line 850, in _invoke_run + time.sleep(monitor_interval) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/api.py", line 60, in _terminate_process_handler + raise SignalException(f"Process {os.getpid()} got signal: {sigval}", sigval=sigval) +torch.distributed.elastic.multiprocessing.api.SignalException: Process 289698 got signal: 15 +Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 194, in _run_module_as_main + return _run_code(code, main_globals, None, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 87, in _run_code + exec(code, run_globals) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 765, in + main() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 761, in main + run(args) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 752, in run + elastic_launch( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 131, in __call__ + return launch_agent(self._config, self._entrypoint, list(args)) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 236, in launch_agent + result = agent.run() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/metrics/api.py", line 125, in wrapper + result = f(*args, **kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/agent/server/api.py", line 709, in run + result = self._invoke_run(role) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/agent/server/api.py", line 850, in _invoke_run + time.sleep(monitor_interval) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/api.py", line 60, in _terminate_process_handler + raise SignalException(f"Process {os.getpid()} got signal: {sigval}", sigval=sigval) +torch.distributed.elastic.multiprocessing.api.SignalException: Process 303272 got signal: 15 +Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 194, in _run_module_as_main + return _run_code(code, main_globals, None, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 87, in _run_code + exec(code, run_globals) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 765, in + main() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 761, in main + run(args) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 752, in run + elastic_launch( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 131, in __call__ + return launch_agent(self._config, self._entrypoint, list(args)) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 236, in launch_agent + result = agent.run() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/metrics/api.py", line 125, in wrapper + result = f(*args, **kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/agent/server/api.py", line 709, in run + result = self._invoke_run(role) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/agent/server/api.py", line 850, in _invoke_run + time.sleep(monitor_interval) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/api.py", line 60, in _terminate_process_handler + raise SignalException(f"Process {os.getpid()} got signal: {sigval}", sigval=sigval) +torch.distributed.elastic.multiprocessing.api.SignalException: Process 201208 got signal: 15 +srun: error: jean-zay-iam52: task 7: Exited with exit code 1 +srun: error: jean-zay-iam42: task 4: Exited with exit code 1 +srun: error: jean-zay-iam43: task 5: Exited with exit code 1 +srun: error: jean-zay-iam51: task 6: Exited with exit code 1 +/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/cuda/__init__.py:83: UserWarning: CUDA initialization: Unexpected error from cudaGetDeviceCount(). Did you run some cuda functions before calling NumCudaDevices() that might have already set an error? Error 802: system not yet initialized (Triggered internally at /opt/conda/conda-bld/pytorch_1656352465323/work/c10/cuda/CUDAFunctions.cpp:109.) + return torch._C._cuda_getDeviceCount() > 0 +/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/cuda/__init__.py:83: UserWarning: CUDA initialization: Unexpected error from cudaGetDeviceCount(). Did you run some cuda functions before calling NumCudaDevices() that might have already set an error? Error 802: system not yet initialized (Triggered internally at /opt/conda/conda-bld/pytorch_1656352465323/work/c10/cuda/CUDAFunctions.cpp:109.) + return torch._C._cuda_getDeviceCount() > 0 +WARNING:__main__: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +WARNING:__main__: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +WARNING:__main__: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +WARNING:__main__: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +WARNING:__main__: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +WARNING:__main__: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +WARNING:__main__: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +WARNING:__main__: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +[default3]:/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/cuda/__init__.py:83: UserWarning: CUDA initialization: Unexpected error from cudaGetDeviceCount(). Did you run some cuda functions before calling NumCudaDevices() that might have already set an error? Error 802: system not yet initialized (Triggered internally at /opt/conda/conda-bld/pytorch_1656352465323/work/c10/cuda/CUDAFunctions.cpp:109.) +[default3]: return torch._C._cuda_getDeviceCount() > 0 +[default5]:/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/cuda/__init__.py:83: UserWarning: CUDA initialization: Unexpected error from cudaGetDeviceCount(). Did you run some cuda functions before calling NumCudaDevices() that might have already set an error? Error 802: system not yet initialized (Triggered internally at /opt/conda/conda-bld/pytorch_1656352465323/work/c10/cuda/CUDAFunctions.cpp:109.) +[default5]: return torch._C._cuda_getDeviceCount() > 0 +[default6]:/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/cuda/__init__.py:83: UserWarning: CUDA initialization: Unexpected error from cudaGetDeviceCount(). Did you run some cuda functions before calling NumCudaDevices() that might have already set an error? Error 802: system not yet initialized (Triggered internally at /opt/conda/conda-bld/pytorch_1656352465323/work/c10/cuda/CUDAFunctions.cpp:109.) +[default6]: return torch._C._cuda_getDeviceCount() > 0 +[default4]:/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/cuda/__init__.py:83: UserWarning: CUDA initialization: Unexpected error from cudaGetDeviceCount(). Did you run some cuda functions before calling NumCudaDevices() that might have already set an error? Error 802: system not yet initialized (Triggered internally at /opt/conda/conda-bld/pytorch_1656352465323/work/c10/cuda/CUDAFunctions.cpp:109.) +[default4]: return torch._C._cuda_getDeviceCount() > 0 +[default0]:/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/cuda/__init__.py:83: UserWarning: CUDA initialization: Unexpected error from cudaGetDeviceCount(). Did you run some cuda functions before calling NumCudaDevices() that might have already set an error? Error 802: system not yet initialized (Triggered internally at /opt/conda/conda-bld/pytorch_1656352465323/work/c10/cuda/CUDAFunctions.cpp:109.) +[default0]: return torch._C._cuda_getDeviceCount() > 0 +[default2]:/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/cuda/__init__.py:83: UserWarning: CUDA initialization: Unexpected error from cudaGetDeviceCount(). Did you run some cuda functions before calling NumCudaDevices() that might have already set an error? Error 802: system not yet initialized (Triggered internally at /opt/conda/conda-bld/pytorch_1656352465323/work/c10/cuda/CUDAFunctions.cpp:109.) +[default2]: return torch._C._cuda_getDeviceCount() > 0 +[default1]:/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/cuda/__init__.py:83: UserWarning: CUDA initialization: Unexpected error from cudaGetDeviceCount(). Did you run some cuda functions before calling NumCudaDevices() that might have already set an error? Error 802: system not yet initialized (Triggered internally at /opt/conda/conda-bld/pytorch_1656352465323/work/c10/cuda/CUDAFunctions.cpp:109.) +[default1]: return torch._C._cuda_getDeviceCount() > 0 +[default4]:/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/cuda/__init__.py:83: UserWarning: CUDA initialization: Unexpected error from cudaGetDeviceCount(). Did you run some cuda functions before calling NumCudaDevices() that might have already set an error? Error 802: system not yet initialized (Triggered internally at /opt/conda/conda-bld/pytorch_1656352465323/work/c10/cuda/CUDAFunctions.cpp:109.) +[default4]: return torch._C._cuda_getDeviceCount() > 0 +[default7]:/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/cuda/__init__.py:83: UserWarning: CUDA initialization: Unexpected error from cudaGetDeviceCount(). Did you run some cuda functions before calling NumCudaDevices() that might have already set an error? Error 802: system not yet initialized (Triggered internally at /opt/conda/conda-bld/pytorch_1656352465323/work/c10/cuda/CUDAFunctions.cpp:109.) +[default7]: return torch._C._cuda_getDeviceCount() > 0 +[default3]:/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/cuda/__init__.py:83: UserWarning: CUDA initialization: Unexpected error from cudaGetDeviceCount(). Did you run some cuda functions before calling NumCudaDevices() that might have already set an error? Error 802: system not yet initialized (Triggered internally at /opt/conda/conda-bld/pytorch_1656352465323/work/c10/cuda/CUDAFunctions.cpp:109.) +[default3]: return torch._C._cuda_getDeviceCount() > 0 +[default6]:/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/cuda/__init__.py:83: UserWarning: CUDA initialization: Unexpected error from cudaGetDeviceCount(). Did you run some cuda functions before calling NumCudaDevices() that might have already set an error? Error 802: system not yet initialized (Triggered internally at /opt/conda/conda-bld/pytorch_1656352465323/work/c10/cuda/CUDAFunctions.cpp:109.) +[default6]: return torch._C._cuda_getDeviceCount() > 0 +[default7]:/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/cuda/__init__.py:83: UserWarning: CUDA initialization: Unexpected error from cudaGetDeviceCount(). Did you run some cuda functions before calling NumCudaDevices() that might have already set an error? Error 802: system not yet initialized (Triggered internally at /opt/conda/conda-bld/pytorch_1656352465323/work/c10/cuda/CUDAFunctions.cpp:109.) +[default7]: return torch._C._cuda_getDeviceCount() > 0 +[default1]:/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/cuda/__init__.py:83: UserWarning: CUDA initialization: Unexpected error from cudaGetDeviceCount(). Did you run some cuda functions before calling NumCudaDevices() that might have already set an error? Error 802: system not yet initialized (Triggered internally at /opt/conda/conda-bld/pytorch_1656352465323/work/c10/cuda/CUDAFunctions.cpp:109.) +[default1]: return torch._C._cuda_getDeviceCount() > 0 +[default0]:/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/cuda/__init__.py:83: UserWarning: CUDA initialization: Unexpected error from cudaGetDeviceCount(). Did you run some cuda functions before calling NumCudaDevices() that might have already set an error? Error 802: system not yet initialized (Triggered internally at /opt/conda/conda-bld/pytorch_1656352465323/work/c10/cuda/CUDAFunctions.cpp:109.) +[default0]: return torch._C._cuda_getDeviceCount() > 0 +[default2]:/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/cuda/__init__.py:83: UserWarning: CUDA initialization: Unexpected error from cudaGetDeviceCount(). Did you run some cuda functions before calling NumCudaDevices() that might have already set an error? Error 802: system not yet initialized (Triggered internally at /opt/conda/conda-bld/pytorch_1656352465323/work/c10/cuda/CUDAFunctions.cpp:109.) +[default2]: return torch._C._cuda_getDeviceCount() > 0 +[default5]:/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/cuda/__init__.py:83: UserWarning: CUDA initialization: Unexpected error from cudaGetDeviceCount(). Did you run some cuda functions before calling NumCudaDevices() that might have already set an error? Error 802: system not yet initialized (Triggered internally at /opt/conda/conda-bld/pytorch_1656352465323/work/c10/cuda/CUDAFunctions.cpp:109.) +[default5]: return torch._C._cuda_getDeviceCount() > 0 +[default1]:No CUDA runtime is found, using CUDA_HOME='/gpfslocalsys/cuda/11.4.3' +[default0]:No CUDA runtime is found, using CUDA_HOME='/gpfslocalsys/cuda/11.4.3' +[default3]:No CUDA runtime is found, using CUDA_HOME='/gpfslocalsys/cuda/11.4.3' +[default2]:No CUDA runtime is found, using CUDA_HOME='/gpfslocalsys/cuda/11.4.3' +[default4]:No CUDA runtime is found, using CUDA_HOME='/gpfslocalsys/cuda/11.4.3' +[default5]:No CUDA runtime is found, using CUDA_HOME='/gpfslocalsys/cuda/11.4.3' +[default6]:No CUDA runtime is found, using CUDA_HOME='/gpfslocalsys/cuda/11.4.3' +[default7]:No CUDA runtime is found, using CUDA_HOME='/gpfslocalsys/cuda/11.4.3' +[default3]:No CUDA runtime is found, using CUDA_HOME='/gpfslocalsys/cuda/11.4.3' +[default4]:No CUDA runtime is found, using CUDA_HOME='/gpfslocalsys/cuda/11.4.3' +[default6]:No CUDA runtime is found, using CUDA_HOME='/gpfslocalsys/cuda/11.4.3' +[default7]:No CUDA runtime is found, using CUDA_HOME='/gpfslocalsys/cuda/11.4.3' +[default0]:Traceback (most recent call last): +[default0]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default0]: main() +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default0]: return f(*args, **kwargs) +[default0]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default0]: pretrain( +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default0]: initialize_megatron(extra_args_provider=extra_args_provider, +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 83, in initialize_megatron +[default0]: assert torch.cuda.is_available(), 'Megatron requires CUDA.' +[default0]:AssertionError: Megatron requires CUDA. +[default2]:Traceback (most recent call last): +[default5]:Traceback (most recent call last): +[default5]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default2]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default5]: main() +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default5]: return f(*args, **kwargs) +[default5]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default5]: pretrain( +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default5]: initialize_megatron(extra_args_provider=extra_args_provider, +[default2]: main() +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default2]: return f(*args, **kwargs) +[default1]:Traceback (most recent call last): +[default1]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default1]: main() +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default1]: return f(*args, **kwargs) +[default1]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default1]: pretrain( +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 83, in initialize_megatron +[default5]: assert torch.cuda.is_available(), 'Megatron requires CUDA.' +[default5]:AssertionError: Megatron requires CUDA. +[default2]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default2]: pretrain( +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default3]:Traceback (most recent call last): +[default1]: initialize_megatron(extra_args_provider=extra_args_provider, +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 83, in initialize_megatron +[default1]: assert torch.cuda.is_available(), 'Megatron requires CUDA.' +[default1]:AssertionError: Megatron requires CUDA. +[default3]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default3]: main() +[default2]: initialize_megatron(extra_args_provider=extra_args_provider, +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default3]: return f(*args, **kwargs) +[default3]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 83, in initialize_megatron +[default2]: assert torch.cuda.is_available(), 'Megatron requires CUDA.' +[default2]:AssertionError: Megatron requires CUDA. +[default3]: pretrain( +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default3]: initialize_megatron(extra_args_provider=extra_args_provider, +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 83, in initialize_megatron +[default3]: assert torch.cuda.is_available(), 'Megatron requires CUDA.' +[default3]:AssertionError: Megatron requires CUDA. +[default6]:Traceback (most recent call last): +[default6]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default6]: main() +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default6]: return f(*args, **kwargs) +[default6]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default6]: pretrain( +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default6]: initialize_megatron(extra_args_provider=extra_args_provider, +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 83, in initialize_megatron +[default6]: assert torch.cuda.is_available(), 'Megatron requires CUDA.' +[default6]:AssertionError: Megatron requires CUDA. +[default4]:Traceback (most recent call last): +[default4]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default4]: main() +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default4]: return f(*args, **kwargs) +[default4]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default4]: pretrain( +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default4]: initialize_megatron(extra_args_provider=extra_args_provider, +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 83, in initialize_megatron +[default4]: assert torch.cuda.is_available(), 'Megatron requires CUDA.' +[default4]:AssertionError: Megatron requires CUDA. +[default7]:Traceback (most recent call last): +[default7]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default7]: main() +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default7]: return f(*args, **kwargs) +[default7]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default7]: pretrain( +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default7]: initialize_megatron(extra_args_provider=extra_args_provider, +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 83, in initialize_megatron +[default7]: assert torch.cuda.is_available(), 'Megatron requires CUDA.' +[default7]:AssertionError: Megatron requires CUDA. +[default3]:Traceback (most recent call last): +[default3]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default3]: main() +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default3]: return f(*args, **kwargs) +[default3]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default3]: pretrain( +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default3]: initialize_megatron(extra_args_provider=extra_args_provider, +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 83, in initialize_megatron +[default3]: assert torch.cuda.is_available(), 'Megatron requires CUDA.' +[default3]:AssertionError: Megatron requires CUDA. +[default4]:Traceback (most recent call last): +[default4]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default4]: main() +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default4]: return f(*args, **kwargs) +[default4]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default4]: pretrain( +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default4]: initialize_megatron(extra_args_provider=extra_args_provider, +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 83, in initialize_megatron +[default4]: assert torch.cuda.is_available(), 'Megatron requires CUDA.' +[default4]:AssertionError: Megatron requires CUDA. +[default6]:Traceback (most recent call last): +[default6]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default6]: main() +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default6]: return f(*args, **kwargs) +[default6]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default6]: pretrain( +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default6]: initialize_megatron(extra_args_provider=extra_args_provider, +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 83, in initialize_megatron +[default6]: assert torch.cuda.is_available(), 'Megatron requires CUDA.' +[default6]:AssertionError: Megatron requires CUDA. +[default7]:Traceback (most recent call last): +[default7]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default7]: main() +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default7]: return f(*args, **kwargs) +[default7]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default7]: pretrain( +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default7]: initialize_megatron(extra_args_provider=extra_args_provider, +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 83, in initialize_megatron +[default7]: assert torch.cuda.is_available(), 'Megatron requires CUDA.' +[default7]:AssertionError: Megatron requires CUDA. +[default2]:No CUDA runtime is found, using CUDA_HOME='/gpfslocalsys/cuda/11.4.3' +[default1]:No CUDA runtime is found, using CUDA_HOME='/gpfslocalsys/cuda/11.4.3' +[default2]:Traceback (most recent call last): +[default1]:Traceback (most recent call last): +[default1]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default1]: main() +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default2]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default2]: main() +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default1]: return f(*args, **kwargs) +[default1]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default1]: pretrain( +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default1]: initialize_megatron(extra_args_provider=extra_args_provider, +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 83, in initialize_megatron +[default2]: return f(*args, **kwargs) +[default2]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default2]: pretrain( +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default1]: assert torch.cuda.is_available(), 'Megatron requires CUDA.' +[default1]:AssertionError: Megatron requires CUDA. +[default2]: initialize_megatron(extra_args_provider=extra_args_provider, +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 83, in initialize_megatron +[default2]: assert torch.cuda.is_available(), 'Megatron requires CUDA.' +[default2]:AssertionError: Megatron requires CUDA. +[default0]:No CUDA runtime is found, using CUDA_HOME='/gpfslocalsys/cuda/11.4.3' +[default5]:No CUDA runtime is found, using CUDA_HOME='/gpfslocalsys/cuda/11.4.3' +[default5]:Traceback (most recent call last): +[default5]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default5]: main() +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default5]: return f(*args, **kwargs) +[default5]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default5]: pretrain( +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default5]: initialize_megatron(extra_args_provider=extra_args_provider, +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 83, in initialize_megatron +[default5]: assert torch.cuda.is_available(), 'Megatron requires CUDA.' +[default5]:AssertionError: Megatron requires CUDA. +[default0]:Traceback (most recent call last): +[default0]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default0]: main() +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default0]: return f(*args, **kwargs) +[default0]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default0]: pretrain( +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default0]: initialize_megatron(extra_args_provider=extra_args_provider, +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 83, in initialize_megatron +[default0]: assert torch.cuda.is_available(), 'Megatron requires CUDA.' +[default0]:AssertionError: Megatron requires CUDA. +ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 0 (pid: 178136) of binary: /gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/bin/python +ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 0 (pid: 184605) of binary: /gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/bin/python +[default0]:using world size: 64, data-parallel-size: 16, tensor-model-parallel size: 2, pipeline-model-parallel size: 2 +[default0]:using torch.float16 for parameters ... +[default0]:------------------------ arguments ------------------------ +[default0]: abort_on_unmet_fused_kernel_constraints ......... True +[default0]: accumulate_allreduce_grads_in_fp32 .............. False +[default0]: adam_beta1 ...................................... 0.9 +[default0]: adam_beta2 ...................................... 0.95 +[default0]: adam_eps ........................................ 1e-08 +[default0]: adlr_autoresume ................................. False +[default0]: adlr_autoresume_interval ........................ 1000 +[default0]: apply_query_key_layer_scaling ................... True +[default0]: apply_residual_connection_post_layernorm ........ False +[default0]: attention_dropout ............................... 0.1 +[default0]: attention_softmax_in_fp32 ....................... False +[default0]: bert_binary_head ................................ True +[default0]: bert_load ....................................... None +[default0]: bf16 ............................................ False +[default0]: bias_dropout_fusion ............................. True +[default0]: bias_gelu_fusion ................................ True +[default0]: biencoder_projection_dim ........................ 0 +[default0]: biencoder_shared_query_context_model ............ False +[default0]: block_data_path ................................. None +[default0]: checkpoint_activations .......................... True +[default0]: checkpoint_in_cpu ............................... False +[default0]: checkpoint_num_layers ........................... 1 +[default0]: clip_grad ....................................... 1.0 +[default0]: codecarbon_dir .................................. None +[default0]: consumed_train_samples .......................... 0 +[default0]: consumed_train_tokens ........................... 0 +[default0]: consumed_valid_samples .......................... 0 +[default0]: contigious_checkpointing ........................ False +[default0]: cpu_optimizer ................................... False +[default0]: cpu_torch_adam .................................. False +[default0]: curriculum_learning ............................. False +[default0]: data_impl ....................................... mmap +[default0]: data_parallel_size .............................. 16 +[default0]: data_path ....................................... None +[default0]: dataloader_type ................................. single +[default0]: DDP_impl ........................................ local +[default0]: decoder_seq_length .............................. None +[default0]: deepscale ....................................... False +[default0]: deepscale_config ................................ None +[default0]: deepspeed ....................................... True +[default0]: deepspeed_activation_checkpointing .............. True +[default0]: deepspeed_config ................................ ./ds_config.2094785.json +[default0]: deepspeed_mpi ................................... False +[default0]: distribute_checkpointed_activations ............. False +[default0]: distributed_backend ............................. nccl +[default0]: embed_layernorm ................................. True +[default0]: embedding_path .................................. None +[default0]: encoder_seq_length .............................. 2048 +[default0]: eod_mask_loss ................................... False +[default0]: eval_interval ................................... 250 +[default0]: eval_iters ...................................... 5 +[default0]: eval_only ....................................... None +[default0]: evidence_data_path .............................. None +[default0]: exit_duration_in_mins ........................... 5990 +[default0]: exit_interval ................................... None +[default0]: ffn_hidden_size ................................. 8192 +[default0]: finetune ........................................ False +[default0]: fp16 ............................................ True +[default0]: fp16_lm_cross_entropy ........................... False +[default0]: fp32_residual_connection ........................ False +[default0]: gigaflos_no_embeds .............................. 0 +[default0]: global_batch_size ............................... 2048 +[default0]: glu_activation .................................. None +[default0]: hidden_dropout .................................. 0.1 +[default0]: hidden_size ..................................... 2048 +[default0]: hysteresis ...................................... 2 +[default0]: ict_head_size ................................... None +[default0]: ict_load ........................................ None +[default0]: img_dim ......................................... 224 +[default0]: indexer_batch_size .............................. 128 +[default0]: indexer_log_interval ............................ 1000 +[default0]: inference ....................................... False +[default0]: init_method_std ................................. 0.0048 +[default0]: init_method_xavier_uniform ...................... False +[default0]: initial_loss_scale .............................. 4294967296 +[default0]: kill_switch_path ................................ /gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/kill-switch-tr13b-1B3-mtf +[default0]: kv_channels ..................................... 128 +[default0]: layernorm_epsilon ............................... 1e-05 +[default0]: lazy_mpu_init ................................... None +[default0]: load ............................................ /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq +[default0]: local_rank ...................................... None +[default0]: log_batch_size_to_tensorboard ................... True +[default0]: log_interval .................................... 1 +[default0]: log_learning_rate_to_tensorboard ................ True +[default0]: log_level ....................................... None +[default0]: log_level_replica ............................... None +[default0]: log_loss_scale_to_tensorboard ................... True +[default0]: log_num_zeros_in_grad ........................... False +[default0]: log_params_norm ................................. False +[default0]: log_path ........................................ None +[default0]: log_timers_to_tensorboard ....................... True +[default0]: log_validation_ppl_to_tensorboard ............... True +[default0]: loss_on_targets_only ............................ False +[default0]: loss_scale ...................................... None +[default0]: loss_scale_window ............................... 1000 +[default0]: lr .............................................. 2e-05 +[default0]: lr_decay_iters .................................. None +[default0]: lr_decay_samples ................................ None +[default0]: lr_decay_style .................................. constant +[default0]: lr_decay_tokens ................................. None +[default0]: lr_warmup_fraction .............................. None +[default0]: lr_warmup_iters ................................. 0 +[default0]: lr_warmup_samples ............................... 0 +[default0]: make_vocab_size_divisible_by .................... 128 +[default0]: mask_prob ....................................... 0.15 +[default0]: masked_softmax_fusion ........................... True +[default0]: max_position_embeddings ......................... 2048 +[default0]: mean_noise_span_length .......................... None +[default0]: memory_centric_tiled_linear ..................... False +[default0]: merge_file ...................................... None +[default0]: micro_batch_size ................................ 1 +[default0]: min_loss_scale .................................. 1.0 +[default0]: min_lr .......................................... 0.0 +[default0]: mmap_warmup ..................................... False +[default0]: no_load_optim ................................... True +[default0]: no_load_rng ..................................... None +[default0]: no_save_optim ................................... None +[default0]: no_save_rng ..................................... None +[default0]: noise_density ................................... None +[default0]: norm_target_loss ................................ True +[default0]: num_attention_heads ............................. 16 +[default0]: num_channels .................................... 3 +[default0]: num_classes ..................................... 1000 +[default0]: num_layers ...................................... 24 +[default0]: num_layers_per_virtual_pipeline_stage ........... None +[default0]: num_workers ..................................... 2 +[default0]: onnx_safe ....................................... None +[default0]: openai_gelu ..................................... False +[default0]: optimizer ....................................... adam +[default0]: override_lr_scheduler ........................... False +[default0]: pad_vocab_size_to ............................... 250880 +[default0]: params_dtype .................................... torch.float16 +[default0]: partition_activations ........................... False +[default0]: patch_dim ....................................... 16 +[default0]: pipeline_model_parallel_size .................... 2 +[default0]: position_embedding_type ......................... PositionEmbeddingType.alibi +[default0]: pp_partition_method ............................. type:transformer|embedding +[default0]: prefixlm ........................................ False +[default0]: profile_backward ................................ False +[default0]: query_in_block_prob ............................. 0.1 +[default0]: rampup_batch_size ............................... None +[default0]: rank ............................................ 0 +[default0]: remote_device ................................... none +[default0]: reset_attention_mask ............................ False +[default0]: reset_position_ids .............................. False +[default0]: reset_progress .................................. True +[default0]: retriever_report_topk_accuracies ................ [] +[default0]: retriever_score_scaling ......................... False +[default0]: retriever_seq_length ............................ 256 +[default0]: reweight_loss_based_on_position_frequency ....... False +[default0]: sample_rate ..................................... 1.0 +[default0]: save ............................................ /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq +[default0]: save_interval ................................... 2 +[default0]: scatter_gather_tensors_in_pipeline .............. True +[default0]: scattered_embeddings ............................ False +[default0]: seed ............................................ 42 +[default0]: seq_length ...................................... 2048 +[default0]: sgd_momentum .................................... 0.9 +[default0]: short_seq_prob .................................. 0.1 +[default0]: skip_train_iteration_range ...................... None +[default0]: split ........................................... None +[default0]: split_transformers .............................. False +[default0]: sync_tp_duplicated_parameters ................... False +[default0]: synchronize_each_layer .......................... False +[default0]: tensor_model_parallel_size ...................... 2 +[default0]: tensorboard_dir ................................. /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/tr13b-1B3-ml-logs/tensorboard/xp3capmixnewcodelonglossseq +[default0]: tensorboard_log_interval ........................ 1 +[default0]: tensorboard_queue_size .......................... 5 +[default0]: test_weighted_split_paths ....................... None +[default0]: test_weighted_split_paths_path .................. None +[default0]: tile_factor ..................................... 1 +[default0]: titles_data_path ................................ None +[default0]: tokenizer_name_or_path .......................... bigscience/tokenizer +[default0]: tokenizer_type .................................. PretrainedFromHF +[default0]: train_iters ..................................... None +[default0]: train_samples ................................... 6348800 +[default0]: train_tokens .................................... None +[default0]: train_weighted_split_names ...................... ['train'] +[default0]: train_weighted_split_paths ...................... [['/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_en', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_es', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_pt', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_code', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_fr', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ar', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_id', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_zh', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_hi', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_vi', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ur', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_te', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ta', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_bn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_mr', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_sw', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_gu', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_pa', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ne', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_yo', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ig', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ny', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_zu', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_xh', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_sn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ts', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_rw', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_lg', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_tn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_nso', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_rn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ml', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_kn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_or', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_as', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ln', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_wo', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_tum', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ki', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_st', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_fon', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ca', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_eu', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ak', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_bm', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_tw']] +[default0]: train_weighted_split_paths_path ................. None +[default0]: train_weighted_split_splits ..................... [['0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950']] +[default0]: train_weighted_split_weights .................... [['0.3924620202', '0.0797519865', '0.0645613968', '0.0592991658', '0.0584218969', '0.0492644683', '0.0485168956', '0.048344327', '0.0462777165', '0.0326663657', '0.0202046859', '0.0140392334', '0.0097374884', '0.0087708344', '0.0070173416', '0.0059077793', '0.0059055884', '0.0055112422', '0.0041465344', '0.0037157869', '0.0034255885', '0.0028662571', '0.0027776135', '0.0026823974', '0.0026594781', '0.0026358847', '0.0026264474', '0.0024850557', '0.0024808426', '0.0024194999', '0.0020327371', '0.0018436532', '0.0017427072', '0.0017007448', '0.0016458059', '0.0013303289', '0.0012908943', '0.0012221364', '0.001211688', '0.0012015765', '0.0011909595', '0.0011650068', '0.001138717', '0.0011385485', '0.0011275944', '0.0011195053']] +[default0]: universal_checkpoint ............................ False +[default0]: use_bnb_optimizer ............................... False +[default0]: use_checkpoint_lr_scheduler ..................... False +[default0]: use_contiguous_buffers_in_ddp ................... False +[default0]: use_cpu_initialization .......................... None +[default0]: use_one_sent_docs ............................... False +[default0]: use_pin_memory .................................. False +[default0]: valid_num_workers ............................... 2 +[default0]: valid_weighted_split_names ...................... ['validation'] +[default0]: valid_weighted_split_paths ...................... [['/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_en', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_es', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_pt', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_code', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_fr', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ar', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_id', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_zh', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_hi', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_vi', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ur', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_te', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ta', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_bn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_mr', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_sw', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_gu', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_pa', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ne', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_yo', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ig', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ny', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_zu', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_xh', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_sn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ts', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_rw', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_lg', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_tn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_nso', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_rn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ml', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_kn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_or', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_as', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ln', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_wo', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_tum', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ki', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_st', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_fon', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ca', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_eu', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ak', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_bm', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_tw']] +[default0]: valid_weighted_split_paths_path ................. None +[default0]: valid_weighted_split_splits ..................... [['0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1']] +[default0]: valid_weighted_split_weights .................... [['0.3924620202', '0.0797519865', '0.0645613968', '0.0592991658', '0.0584218969', '0.0492644683', '0.0485168956', '0.048344327', '0.0462777165', '0.0326663657', '0.0202046859', '0.0140392334', '0.0097374884', '0.0087708344', '0.0070173416', '0.0059077793', '0.0059055884', '0.0055112422', '0.0041465344', '0.0037157869', '0.0034255885', '0.0028662571', '0.0027776135', '0.0026823974', '0.0026594781', '0.0026358847', '0.0026264474', '0.0024850557', '0.0024808426', '0.0024194999', '0.0020327371', '0.0018436532', '0.0017427072', '0.0017007448', '0.0016458059', '0.0013303289', '0.0012908943', '0.0012221364', '0.001211688', '0.0012015765', '0.0011909595', '0.0011650068', '0.001138717', '0.0011385485', '0.0011275944', '0.0011195053']] +[default0]: virtual_pipeline_model_parallel_size ............ None +[default0]: vocab_extra_ids ................................. 0 +[default0]: vocab_file ...................................... None +[default0]: weight_decay .................................... 0.0001 +[default0]: world_size ...................................... 64 +[default0]: zero_allgather_bucket_size ...................... 0.0 +[default0]: zero_contigious_gradients ....................... False +[default0]: zero_reduce_bucket_size ......................... 0.0 +[default0]: zero_reduce_scatter ............................. False +[default0]: zero_stage ...................................... 1 +[default0]:-------------------- end of arguments --------------------- +[default0]:setting number of micro-batches to constant 128 +[default0]:> building PretrainedFromHF tokenizer ... +[default0]: vocab file is un-used. loading tokenizer from pre-trained model +[default0]:Offline mode: forcing local_files_only=True +[default0]:Offline mode: forcing local_files_only=True +[default0]:loading file tokenizer.json from cache at /gpfswork/rech/six/commun/models/models--bigscience--tokenizer/snapshots/d56c744c331ce0ffa516ed084785eb22da74b91e/tokenizer.json +[default0]:loading file added_tokens.json from cache at None +[default0]:loading file special_tokens_map.json from cache at /gpfswork/rech/six/commun/models/models--bigscience--tokenizer/snapshots/d56c744c331ce0ffa516ed084785eb22da74b91e/special_tokens_map.json +[default0]:loading file tokenizer_config.json from cache at /gpfswork/rech/six/commun/models/models--bigscience--tokenizer/snapshots/d56c744c331ce0ffa516ed084785eb22da74b91e/tokenizer_config.json +[default0]: > padded vocab (size: 250680) with 200 dummy tokens (new size: 250880) +[default0]:DeepSpeed general environment info: +[default0]:torch install path ............... ['/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch'] +[default0]:torch version .................... 1.12.0 +[default0]:torch cuda version ............... 11.3 +[default0]:torch hip version ................ None +[default0]:nvcc version ..................... 11.4 +[default0]:deepspeed install path ........... ['/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed'] +[default0]:deepspeed info ................... 0.7.1+8b2a6371, 8b2a6371, master +[default0]:deepspeed wheel compiled w. ...... torch 1.12, cuda 11.3 +[default0]:**** Git info for Megatron: git_hash=6c1018f git_branch=mtf-multival **** +[default0]:> initializing torch distributed ... +[default0]:[2022-10-07 09:16:03,483] [INFO] [comm.py:628:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl +ERROR:torch.distributed.elastic.agent.server.api:Error waiting on exit barrier. Elapsed: 308.869836807251 seconds +Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/agent/server/api.py", line 906, in _exit_barrier + store_util.barrier( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/utils/store.py", line 78, in barrier + synchronize(store, data, rank, world_size, key_prefix, barrier_timeout) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/utils/store.py", line 64, in synchronize + agent_data = get_all(store, rank, key_prefix, world_size) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/utils/store.py", line 34, in get_all + data = store.get(f"{prefix}{idx}") +RuntimeError: Socket Timeout +Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 194, in _run_module_as_main + return _run_code(code, main_globals, None, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 87, in _run_code + exec(code, run_globals) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 765, in + main() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 761, in main + run(args) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 752, in run + elastic_launch( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 131, in __call__ + return launch_agent(self._config, self._entrypoint, list(args)) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 245, in launch_agent + raise ChildFailedError( +torch.distributed.elastic.multiprocessing.errors.ChildFailedError: +============================================================ +/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py FAILED +------------------------------------------------------------ +Failures: +[1]: + time : 2022-10-07_09:15:59 + host : jean-zay-iam48-ib0 + rank : 57 (local_rank: 1) + exitcode : 1 (pid: 178137) + error_file: /tmp/torchelastic_1mxcmhyf/none_29rwis5b/attempt_0/1/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 83, in initialize_megatron + assert torch.cuda.is_available(), 'Megatron requires CUDA.' + AssertionError: Megatron requires CUDA. + +[2]: + time : 2022-10-07_09:15:59 + host : jean-zay-iam48-ib0 + rank : 58 (local_rank: 2) + exitcode : 1 (pid: 178138) + error_file: /tmp/torchelastic_1mxcmhyf/none_29rwis5b/attempt_0/2/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 83, in initialize_megatron + assert torch.cuda.is_available(), 'Megatron requires CUDA.' + AssertionError: Megatron requires CUDA. + +[3]: + time : 2022-10-07_09:15:59 + host : jean-zay-iam48-ib0 + rank : 59 (local_rank: 3) + exitcode : 1 (pid: 178139) + error_file: /tmp/torchelastic_1mxcmhyf/none_29rwis5b/attempt_0/3/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 83, in initialize_megatron + assert torch.cuda.is_available(), 'Megatron requires CUDA.' + AssertionError: Megatron requires CUDA. + +[4]: + time : 2022-10-07_09:15:59 + host : jean-zay-iam48-ib0 + rank : 60 (local_rank: 4) + exitcode : 1 (pid: 178140) + error_file: /tmp/torchelastic_1mxcmhyf/none_29rwis5b/attempt_0/4/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 83, in initialize_megatron + assert torch.cuda.is_available(), 'Megatron requires CUDA.' + AssertionError: Megatron requires CUDA. + +[5]: + time : 2022-10-07_09:15:59 + host : jean-zay-iam48-ib0 + rank : 61 (local_rank: 5) + exitcode : 1 (pid: 178141) + error_file: /tmp/torchelastic_1mxcmhyf/none_29rwis5b/attempt_0/5/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 83, in initialize_megatron + assert torch.cuda.is_available(), 'Megatron requires CUDA.' + AssertionError: Megatron requires CUDA. + +[6]: + time : 2022-10-07_09:15:59 + host : jean-zay-iam48-ib0 + rank : 62 (local_rank: 6) + exitcode : 1 (pid: 178142) + error_file: /tmp/torchelastic_1mxcmhyf/none_29rwis5b/attempt_0/6/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 83, in initialize_megatron + assert torch.cuda.is_available(), 'Megatron requires CUDA.' + AssertionError: Megatron requires CUDA. + +[7]: + time : 2022-10-07_09:15:59 + host : jean-zay-iam48-ib0 + rank : 63 (local_rank: 7) + exitcode : 1 (pid: 178143) + error_file: /tmp/torchelastic_1mxcmhyf/none_29rwis5b/attempt_0/7/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 83, in initialize_megatron + assert torch.cuda.is_available(), 'Megatron requires CUDA.' + AssertionError: Megatron requires CUDA. + +------------------------------------------------------------ +Root Cause (first observed failure): +[0]: + time : 2022-10-07_09:15:59 + host : jean-zay-iam48-ib0 + rank : 56 (local_rank: 0) + exitcode : 1 (pid: 178136) + error_file: /tmp/torchelastic_1mxcmhyf/none_29rwis5b/attempt_0/0/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 83, in initialize_megatron + assert torch.cuda.is_available(), 'Megatron requires CUDA.' + AssertionError: Megatron requires CUDA. + +============================================================ +srun: error: jean-zay-iam48: task 7: Exited with exit code 1 +srun: launch/slurm: _step_signal: Terminating StepId=2094785.0 +slurmstepd: error: *** STEP 2094785.0 ON jean-zay-iam38 CANCELLED AT 2022-10-07T09:21:11 *** +WARNING:torch.distributed.elastic.agent.server.api:Received 15 death signal, shutting down workers +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 378692 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 378693 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 378694 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 378695 closing signal SIGTERM +WARNING:torch.distributed.elastic.agent.server.api:Received 15 death signal, shutting down workers +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 306137 closing signal SIGTERM +WARNING:torch.distributed.elastic.agent.server.api:Received 15 death signal, shutting down workers +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 378696 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 306138 closing signal SIGTERM +WARNING:torch.distributed.elastic.agent.server.api:Received 15 death signal, shutting down workers +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 349903 closing signal SIGTERM +WARNING:torch.distributed.elastic.agent.server.api:Received 15 death signal, shutting down workers +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 349951 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 378697 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 368546 closing signal SIGTERM +WARNING:torch.distributed.elastic.agent.server.api:Received 15 death signal, shutting down workers +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 292554 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 378698 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 349952 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 378699 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 306139 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 306140 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 349904 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 306141 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 349953 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 349905 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 368547 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 292555 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 349906 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 368548 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 349954 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 292556 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 349907 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 368549 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 368550 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 349955 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 292557 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 368551 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 349908 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 292558 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 349956 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 292559 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 368552 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 349909 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 349957 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 349910 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 306142 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 292560 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 349958 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 306143 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 292561 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 306144 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 368553 closing signal SIGTERM +WARNING:torch.distributed.elastic.agent.server.api:Received 15 death signal, shutting down workers +Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/agent/server/api.py", line 906, in _exit_barrier + store_util.barrier( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/utils/store.py", line 78, in barrier + synchronize(store, data, rank, world_size, key_prefix, barrier_timeout) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/utils/store.py", line 64, in synchronize + agent_data = get_all(store, rank, key_prefix, world_size) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/utils/store.py", line 34, in get_all + data = store.get(f"{prefix}{idx}") +RuntimeError: Socket Timeout + +During handling of the above exception, another exception occurred: + +Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 194, in _run_module_as_main + return _run_code(code, main_globals, None, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 87, in _run_code + exec(code, run_globals) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 765, in + main() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 761, in main + run(args) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 752, in run + elastic_launch( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 131, in __call__ + return launch_agent(self._config, self._entrypoint, list(args)) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 236, in launch_agent + result = agent.run() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/metrics/api.py", line 125, in wrapper + result = f(*args, **kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/agent/server/api.py", line 709, in run + result = self._invoke_run(role) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/agent/server/api.py", line 877, in _invoke_run + self._exit_barrier() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/agent/server/api.py", line 906, in _exit_barrier + store_util.barrier( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/api.py", line 60, in _terminate_process_handler + raise SignalException(f"Process {os.getpid()} got signal: {sigval}", sigval=sigval) +torch.distributed.elastic.multiprocessing.api.SignalException: Process 184586 got signal: 15 +srun: error: jean-zay-iam47: task 6: Exited with exit code 1 +Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 194, in _run_module_as_main + return _run_code(code, main_globals, None, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 87, in _run_code + exec(code, run_globals) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 765, in + main() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 761, in main + run(args) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 752, in run + elastic_launch( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 131, in __call__ + return launch_agent(self._config, self._entrypoint, list(args)) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 236, in launch_agent + result = agent.run() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/metrics/api.py", line 125, in wrapper + result = f(*args, **kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/agent/server/api.py", line 709, in run + result = self._invoke_run(role) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/agent/server/api.py", line 850, in _invoke_run + time.sleep(monitor_interval) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/api.py", line 60, in _terminate_process_handler + raise SignalException(f"Process {os.getpid()} got signal: {sigval}", sigval=sigval) +torch.distributed.elastic.multiprocessing.api.SignalException: Process 349865 got signal: 15 +Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 194, in _run_module_as_main + return _run_code(code, main_globals, None, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 87, in _run_code + exec(code, run_globals) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 765, in + main() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 761, in main + run(args) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 752, in run + elastic_launch( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 131, in __call__ + return launch_agent(self._config, self._entrypoint, list(args)) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 236, in launch_agent + result = agent.run() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/metrics/api.py", line 125, in wrapper + result = f(*args, **kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/agent/server/api.py", line 709, in run + result = self._invoke_run(role) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/agent/server/api.py", line 850, in _invoke_run + time.sleep(monitor_interval) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/api.py", line 60, in _terminate_process_handler + raise SignalException(f"Process {os.getpid()} got signal: {sigval}", sigval=sigval) +torch.distributed.elastic.multiprocessing.api.SignalException: Process 292516 got signal: 15 +Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 194, in _run_module_as_main + return _run_code(code, main_globals, None, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 87, in _run_code + exec(code, run_globals) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 765, in + main() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 761, in main + run(args) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 752, in run + elastic_launch( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 131, in __call__ + return launch_agent(self._config, self._entrypoint, list(args)) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 236, in launch_agent + result = agent.run() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/metrics/api.py", line 125, in wrapper + result = f(*args, **kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/agent/server/api.py", line 709, in run + result = self._invoke_run(role) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/agent/server/api.py", line 850, in _invoke_run + time.sleep(monitor_interval) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/api.py", line 60, in _terminate_process_handler + raise SignalException(f"Process {os.getpid()} got signal: {sigval}", sigval=sigval) +torch.distributed.elastic.multiprocessing.api.SignalException: Process 368508 got signal: 15 +Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 194, in _run_module_as_main + return _run_code(code, main_globals, None, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 87, in _run_code + exec(code, run_globals) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 765, in + main() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 761, in main + run(args) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 752, in run + elastic_launch( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 131, in __call__ + return launch_agent(self._config, self._entrypoint, list(args)) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 236, in launch_agent + result = agent.run() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/metrics/api.py", line 125, in wrapper + result = f(*args, **kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/agent/server/api.py", line 709, in run + result = self._invoke_run(role) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/agent/server/api.py", line 850, in _invoke_run + time.sleep(monitor_interval) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/api.py", line 60, in _terminate_process_handler + raise SignalException(f"Process {os.getpid()} got signal: {sigval}", sigval=sigval) +torch.distributed.elastic.multiprocessing.api.SignalException: Process 378653 got signal: 15 +Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 194, in _run_module_as_main + return _run_code(code, main_globals, None, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 87, in _run_code + exec(code, run_globals) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 765, in + main() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 761, in main + run(args) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 752, in run + elastic_launch( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 131, in __call__ + return launch_agent(self._config, self._entrypoint, list(args)) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 236, in launch_agent + result = agent.run() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/metrics/api.py", line 125, in wrapper + result = f(*args, **kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/agent/server/api.py", line 709, in run + result = self._invoke_run(role) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/agent/server/api.py", line 850, in _invoke_run + time.sleep(monitor_interval) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/api.py", line 60, in _terminate_process_handler + raise SignalException(f"Process {os.getpid()} got signal: {sigval}", sigval=sigval) +torch.distributed.elastic.multiprocessing.api.SignalException: Process 306099 got signal: 15 +Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 194, in _run_module_as_main + return _run_code(code, main_globals, None, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 87, in _run_code + exec(code, run_globals) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 765, in + main() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 761, in main + run(args) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 752, in run + elastic_launch( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 131, in __call__ + return launch_agent(self._config, self._entrypoint, list(args)) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 236, in launch_agent + result = agent.run() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/metrics/api.py", line 125, in wrapper + result = f(*args, **kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/agent/server/api.py", line 709, in run + result = self._invoke_run(role) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/agent/server/api.py", line 850, in _invoke_run + time.sleep(monitor_interval) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/api.py", line 60, in _terminate_process_handler + raise SignalException(f"Process {os.getpid()} got signal: {sigval}", sigval=sigval) +torch.distributed.elastic.multiprocessing.api.SignalException: Process 349913 got signal: 15 +srun: error: jean-zay-iam42: task 4: Exited with exit code 1 +srun: error: jean-zay-iam41: task 3: Exited with exit code 1 +srun: error: jean-zay-iam40: task 2: Exited with exit code 1 +srun: error: jean-zay-iam43: task 5: Exited with exit code 1 +srun: error: jean-zay-iam38: task 0: Exited with exit code 1 +srun: error: jean-zay-iam39: task 1: Exited with exit code 1 +/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/cuda/__init__.py:83: UserWarning: CUDA initialization: Unexpected error from cudaGetDeviceCount(). Did you run some cuda functions before calling NumCudaDevices() that might have already set an error? Error 802: system not yet initialized (Triggered internally at /opt/conda/conda-bld/pytorch_1656352465323/work/c10/cuda/CUDAFunctions.cpp:109.) + return torch._C._cuda_getDeviceCount() > 0 +/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/cuda/__init__.py:83: UserWarning: CUDA initialization: Unexpected error from cudaGetDeviceCount(). Did you run some cuda functions before calling NumCudaDevices() that might have already set an error? Error 802: system not yet initialized (Triggered internally at /opt/conda/conda-bld/pytorch_1656352465323/work/c10/cuda/CUDAFunctions.cpp:109.) + return torch._C._cuda_getDeviceCount() > 0 +WARNING:__main__: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +WARNING:__main__: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +WARNING:__main__: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +WARNING:__main__: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +WARNING:__main__: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +WARNING:__main__: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +WARNING:__main__: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +WARNING:__main__: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +[default0]:/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/cuda/__init__.py:83: UserWarning: CUDA initialization: Unexpected error from cudaGetDeviceCount(). Did you run some cuda functions before calling NumCudaDevices() that might have already set an error? Error 802: system not yet initialized (Triggered internally at /opt/conda/conda-bld/pytorch_1656352465323/work/c10/cuda/CUDAFunctions.cpp:109.) +[default0]: return torch._C._cuda_getDeviceCount() > 0 +[default2]:/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/cuda/__init__.py:83: UserWarning: CUDA initialization: Unexpected error from cudaGetDeviceCount(). Did you run some cuda functions before calling NumCudaDevices() that might have already set an error? Error 802: system not yet initialized (Triggered internally at /opt/conda/conda-bld/pytorch_1656352465323/work/c10/cuda/CUDAFunctions.cpp:109.) +[default2]: return torch._C._cuda_getDeviceCount() > 0 +[default3]:/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/cuda/__init__.py:83: UserWarning: CUDA initialization: Unexpected error from cudaGetDeviceCount(). Did you run some cuda functions before calling NumCudaDevices() that might have already set an error? Error 802: system not yet initialized (Triggered internally at /opt/conda/conda-bld/pytorch_1656352465323/work/c10/cuda/CUDAFunctions.cpp:109.) +[default3]: return torch._C._cuda_getDeviceCount() > 0 +[default4]:/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/cuda/__init__.py:83: UserWarning: CUDA initialization: Unexpected error from cudaGetDeviceCount(). Did you run some cuda functions before calling NumCudaDevices() that might have already set an error? Error 802: system not yet initialized (Triggered internally at /opt/conda/conda-bld/pytorch_1656352465323/work/c10/cuda/CUDAFunctions.cpp:109.) +[default4]: return torch._C._cuda_getDeviceCount() > 0 +[default6]:/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/cuda/__init__.py:83: UserWarning: CUDA initialization: Unexpected error from cudaGetDeviceCount(). Did you run some cuda functions before calling NumCudaDevices() that might have already set an error? Error 802: system not yet initialized (Triggered internally at /opt/conda/conda-bld/pytorch_1656352465323/work/c10/cuda/CUDAFunctions.cpp:109.) +[default6]: return torch._C._cuda_getDeviceCount() > 0 +[default7]:/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/cuda/__init__.py:83: UserWarning: CUDA initialization: Unexpected error from cudaGetDeviceCount(). Did you run some cuda functions before calling NumCudaDevices() that might have already set an error? Error 802: system not yet initialized (Triggered internally at /opt/conda/conda-bld/pytorch_1656352465323/work/c10/cuda/CUDAFunctions.cpp:109.) +[default7]: return torch._C._cuda_getDeviceCount() > 0 +[default7]:/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/cuda/__init__.py:83: UserWarning: CUDA initialization: Unexpected error from cudaGetDeviceCount(). Did you run some cuda functions before calling NumCudaDevices() that might have already set an error? Error 802: system not yet initialized (Triggered internally at /opt/conda/conda-bld/pytorch_1656352465323/work/c10/cuda/CUDAFunctions.cpp:109.) +[default7]: return torch._C._cuda_getDeviceCount() > 0 +[default1]:/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/cuda/__init__.py:83: UserWarning: CUDA initialization: Unexpected error from cudaGetDeviceCount(). Did you run some cuda functions before calling NumCudaDevices() that might have already set an error? Error 802: system not yet initialized (Triggered internally at /opt/conda/conda-bld/pytorch_1656352465323/work/c10/cuda/CUDAFunctions.cpp:109.) +[default1]: return torch._C._cuda_getDeviceCount() > 0 +[default5]:/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/cuda/__init__.py:83: UserWarning: CUDA initialization: Unexpected error from cudaGetDeviceCount(). Did you run some cuda functions before calling NumCudaDevices() that might have already set an error? Error 802: system not yet initialized (Triggered internally at /opt/conda/conda-bld/pytorch_1656352465323/work/c10/cuda/CUDAFunctions.cpp:109.) +[default5]: return torch._C._cuda_getDeviceCount() > 0 +[default0]:/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/cuda/__init__.py:83: UserWarning: CUDA initialization: Unexpected error from cudaGetDeviceCount(). Did you run some cuda functions before calling NumCudaDevices() that might have already set an error? Error 802: system not yet initialized (Triggered internally at /opt/conda/conda-bld/pytorch_1656352465323/work/c10/cuda/CUDAFunctions.cpp:109.) +[default0]: return torch._C._cuda_getDeviceCount() > 0 +[default2]:/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/cuda/__init__.py:83: UserWarning: CUDA initialization: Unexpected error from cudaGetDeviceCount(). Did you run some cuda functions before calling NumCudaDevices() that might have already set an error? Error 802: system not yet initialized (Triggered internally at /opt/conda/conda-bld/pytorch_1656352465323/work/c10/cuda/CUDAFunctions.cpp:109.) +[default2]: return torch._C._cuda_getDeviceCount() > 0 +[default3]:/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/cuda/__init__.py:83: UserWarning: CUDA initialization: Unexpected error from cudaGetDeviceCount(). Did you run some cuda functions before calling NumCudaDevices() that might have already set an error? Error 802: system not yet initialized (Triggered internally at /opt/conda/conda-bld/pytorch_1656352465323/work/c10/cuda/CUDAFunctions.cpp:109.) +[default3]: return torch._C._cuda_getDeviceCount() > 0 +[default4]:/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/cuda/__init__.py:83: UserWarning: CUDA initialization: Unexpected error from cudaGetDeviceCount(). Did you run some cuda functions before calling NumCudaDevices() that might have already set an error? Error 802: system not yet initialized (Triggered internally at /opt/conda/conda-bld/pytorch_1656352465323/work/c10/cuda/CUDAFunctions.cpp:109.) +[default4]: return torch._C._cuda_getDeviceCount() > 0 +[default1]:/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/cuda/__init__.py:83: UserWarning: CUDA initialization: Unexpected error from cudaGetDeviceCount(). Did you run some cuda functions before calling NumCudaDevices() that might have already set an error? Error 802: system not yet initialized (Triggered internally at /opt/conda/conda-bld/pytorch_1656352465323/work/c10/cuda/CUDAFunctions.cpp:109.) +[default1]: return torch._C._cuda_getDeviceCount() > 0 +[default5]:/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/cuda/__init__.py:83: UserWarning: CUDA initialization: Unexpected error from cudaGetDeviceCount(). Did you run some cuda functions before calling NumCudaDevices() that might have already set an error? Error 802: system not yet initialized (Triggered internally at /opt/conda/conda-bld/pytorch_1656352465323/work/c10/cuda/CUDAFunctions.cpp:109.) +[default5]: return torch._C._cuda_getDeviceCount() > 0 +[default6]:/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/cuda/__init__.py:83: UserWarning: CUDA initialization: Unexpected error from cudaGetDeviceCount(). Did you run some cuda functions before calling NumCudaDevices() that might have already set an error? Error 802: system not yet initialized (Triggered internally at /opt/conda/conda-bld/pytorch_1656352465323/work/c10/cuda/CUDAFunctions.cpp:109.) +[default6]: return torch._C._cuda_getDeviceCount() > 0 +[default0]:No CUDA runtime is found, using CUDA_HOME='/gpfslocalsys/cuda/11.4.3' +[default2]:No CUDA runtime is found, using CUDA_HOME='/gpfslocalsys/cuda/11.4.3' +[default3]:No CUDA runtime is found, using CUDA_HOME='/gpfslocalsys/cuda/11.4.3' +[default4]:No CUDA runtime is found, using CUDA_HOME='/gpfslocalsys/cuda/11.4.3' +[default6]:No CUDA runtime is found, using CUDA_HOME='/gpfslocalsys/cuda/11.4.3' +[default7]:No CUDA runtime is found, using CUDA_HOME='/gpfslocalsys/cuda/11.4.3' +[default2]:Traceback (most recent call last): +[default2]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew2/Megatron-DeepSpeed/finetune_t0.py", line 188, in +[default2]: main() +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default2]: return f(*args, **kwargs) +[default2]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew2/Megatron-DeepSpeed/finetune_t0.py", line 180, in main +[default2]: pretrain( +[default0]:Traceback (most recent call last): +[default0]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew2/Megatron-DeepSpeed/finetune_t0.py", line 188, in +[default0]: main() +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default0]: return f(*args, **kwargs) +[default0]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew2/Megatron-DeepSpeed/finetune_t0.py", line 180, in main +[default6]:Traceback (most recent call last): +[default6]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew2/Megatron-DeepSpeed/finetune_t0.py", line 188, in +[default6]: main() +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default4]:Traceback (most recent call last): +[default4]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew2/Megatron-DeepSpeed/finetune_t0.py", line 188, in +[default4]: main() +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default0]: pretrain( +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew2/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default0]: initialize_megatron(extra_args_provider=extra_args_provider, +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew2/Megatron-DeepSpeed/megatron/initialize.py", line 83, in initialize_megatron +[default0]: assert torch.cuda.is_available(), 'Megatron requires CUDA.' +[default6]: return f(*args, **kwargs) +[default4]: return f(*args, **kwargs) +[default4]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew2/Megatron-DeepSpeed/finetune_t0.py", line 180, in main +[default4]: pretrain( +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew2/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default0]:No CUDA runtime is found, using CUDA_HOME='/gpfslocalsys/cuda/11.4.3' +[default4]: initialize_megatron(extra_args_provider=extra_args_provider, +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew2/Megatron-DeepSpeed/megatron/initialize.py", line 83, in initialize_megatron +[default6]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew2/Megatron-DeepSpeed/finetune_t0.py", line 180, in main +[default6]: pretrain( +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew2/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default6]: initialize_megatron(extra_args_provider=extra_args_provider, +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew2/Megatron-DeepSpeed/megatron/initialize.py", line 83, in initialize_megatron +[default3]:Traceback (most recent call last): +[default3]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew2/Megatron-DeepSpeed/finetune_t0.py", line 188, in +[default3]: main() +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default3]: return f(*args, **kwargs) +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew2/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default2]: initialize_megatron(extra_args_provider=extra_args_provider, +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew2/Megatron-DeepSpeed/megatron/initialize.py", line 83, in initialize_megatron +[default6]: assert torch.cuda.is_available(), 'Megatron requires CUDA.' +[default4]: assert torch.cuda.is_available(), 'Megatron requires CUDA.' +[default4]:AssertionError: Megatron requires CUDA. +[default2]: assert torch.cuda.is_available(), 'Megatron requires CUDA.' +[default0]:AssertionError: Megatron requires CUDA. +[default3]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew2/Megatron-DeepSpeed/finetune_t0.py", line 180, in main +[default3]: pretrain( +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew2/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default3]: initialize_megatron(extra_args_provider=extra_args_provider, +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew2/Megatron-DeepSpeed/megatron/initialize.py", line 83, in initialize_megatron +[default3]: assert torch.cuda.is_available(), 'Megatron requires CUDA.' +[default3]:AssertionError: Megatron requires CUDA. +[default6]:AssertionError: Megatron requires CUDA. +[default2]:AssertionError: Megatron requires CUDA. +[default7]:Traceback (most recent call last): +[default7]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew2/Megatron-DeepSpeed/finetune_t0.py", line 188, in +[default7]: main() +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default7]: return f(*args, **kwargs) +[default7]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew2/Megatron-DeepSpeed/finetune_t0.py", line 180, in main +[default7]: pretrain( +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew2/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default7]: initialize_megatron(extra_args_provider=extra_args_provider, +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew2/Megatron-DeepSpeed/megatron/initialize.py", line 83, in initialize_megatron +[default7]: assert torch.cuda.is_available(), 'Megatron requires CUDA.' +[default7]:AssertionError: Megatron requires CUDA. +[default1]:No CUDA runtime is found, using CUDA_HOME='/gpfslocalsys/cuda/11.4.3' +[default3]:No CUDA runtime is found, using CUDA_HOME='/gpfslocalsys/cuda/11.4.3' +[default2]:No CUDA runtime is found, using CUDA_HOME='/gpfslocalsys/cuda/11.4.3' +[default4]:No CUDA runtime is found, using CUDA_HOME='/gpfslocalsys/cuda/11.4.3' +[default5]:No CUDA runtime is found, using CUDA_HOME='/gpfslocalsys/cuda/11.4.3' +[default7]:No CUDA runtime is found, using CUDA_HOME='/gpfslocalsys/cuda/11.4.3' +[default6]:No CUDA runtime is found, using CUDA_HOME='/gpfslocalsys/cuda/11.4.3' +[default0]:Traceback (most recent call last): +[default0]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew2/Megatron-DeepSpeed/finetune_t0.py", line 188, in +[default0]: main() +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default0]: return f(*args, **kwargs) +[default0]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew2/Megatron-DeepSpeed/finetune_t0.py", line 180, in main +[default0]: pretrain( +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew2/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default0]: initialize_megatron(extra_args_provider=extra_args_provider, +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew2/Megatron-DeepSpeed/megatron/initialize.py", line 83, in initialize_megatron +[default0]: assert torch.cuda.is_available(), 'Megatron requires CUDA.' +[default0]:AssertionError: Megatron requires CUDA. +[default2]:Traceback (most recent call last): +[default2]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew2/Megatron-DeepSpeed/finetune_t0.py", line 188, in +[default2]: main() +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default3]:Traceback (most recent call last): +[default3]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew2/Megatron-DeepSpeed/finetune_t0.py", line 188, in +[default3]: main() +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default4]:Traceback (most recent call last): +[default4]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew2/Megatron-DeepSpeed/finetune_t0.py", line 188, in +[default4]: main() +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default4]: return f(*args, **kwargs) +[default4]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew2/Megatron-DeepSpeed/finetune_t0.py", line 180, in main +[default4]: pretrain( +[default3]: return f(*args, **kwargs) +[default3]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew2/Megatron-DeepSpeed/finetune_t0.py", line 180, in main +[default3]: pretrain( +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew2/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default2]: return f(*args, **kwargs) +[default2]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew2/Megatron-DeepSpeed/finetune_t0.py", line 180, in main +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew2/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default4]: initialize_megatron(extra_args_provider=extra_args_provider, +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew2/Megatron-DeepSpeed/megatron/initialize.py", line 83, in initialize_megatron +[default2]: pretrain( +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew2/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default4]: assert torch.cuda.is_available(), 'Megatron requires CUDA.' +[default4]:AssertionError: Megatron requires CUDA. +[default3]: initialize_megatron(extra_args_provider=extra_args_provider, +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew2/Megatron-DeepSpeed/megatron/initialize.py", line 83, in initialize_megatron +[default2]: initialize_megatron(extra_args_provider=extra_args_provider, +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew2/Megatron-DeepSpeed/megatron/initialize.py", line 83, in initialize_megatron +[default2]: assert torch.cuda.is_available(), 'Megatron requires CUDA.' +[default2]:AssertionError: Megatron requires CUDA. +[default3]: assert torch.cuda.is_available(), 'Megatron requires CUDA.' +[default3]:AssertionError: Megatron requires CUDA. +[default1]:Traceback (most recent call last): +[default1]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew2/Megatron-DeepSpeed/finetune_t0.py", line 188, in +[default1]: main() +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default1]: return f(*args, **kwargs) +[default1]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew2/Megatron-DeepSpeed/finetune_t0.py", line 180, in main +[default1]: pretrain( +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew2/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default1]: initialize_megatron(extra_args_provider=extra_args_provider, +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew2/Megatron-DeepSpeed/megatron/initialize.py", line 83, in initialize_megatron +[default1]: assert torch.cuda.is_available(), 'Megatron requires CUDA.' +[default1]:AssertionError: Megatron requires CUDA. +[default6]:Traceback (most recent call last): +[default6]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew2/Megatron-DeepSpeed/finetune_t0.py", line 188, in +[default6]: main() +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default6]: return f(*args, **kwargs) +[default6]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew2/Megatron-DeepSpeed/finetune_t0.py", line 180, in main +[default6]: pretrain( +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew2/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default6]: initialize_megatron(extra_args_provider=extra_args_provider, +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew2/Megatron-DeepSpeed/megatron/initialize.py", line 83, in initialize_megatron +[default6]: assert torch.cuda.is_available(), 'Megatron requires CUDA.' +[default6]:AssertionError: Megatron requires CUDA. +[default5]:Traceback (most recent call last): +[default5]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew2/Megatron-DeepSpeed/finetune_t0.py", line 188, in +[default5]: main() +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default5]: return f(*args, **kwargs) +[default5]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew2/Megatron-DeepSpeed/finetune_t0.py", line 180, in main +[default5]: pretrain( +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew2/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default5]: initialize_megatron(extra_args_provider=extra_args_provider, +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew2/Megatron-DeepSpeed/megatron/initialize.py", line 83, in initialize_megatron +[default5]: assert torch.cuda.is_available(), 'Megatron requires CUDA.' +[default5]:AssertionError: Megatron requires CUDA. +[default7]:Traceback (most recent call last): +[default7]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew2/Megatron-DeepSpeed/finetune_t0.py", line 188, in +[default7]: main() +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default7]: return f(*args, **kwargs) +[default7]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew2/Megatron-DeepSpeed/finetune_t0.py", line 180, in main +[default7]: pretrain( +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew2/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default7]: initialize_megatron(extra_args_provider=extra_args_provider, +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew2/Megatron-DeepSpeed/megatron/initialize.py", line 83, in initialize_megatron +[default7]: assert torch.cuda.is_available(), 'Megatron requires CUDA.' +[default7]:AssertionError: Megatron requires CUDA. +[default1]:No CUDA runtime is found, using CUDA_HOME='/gpfslocalsys/cuda/11.4.3' +[default5]:No CUDA runtime is found, using CUDA_HOME='/gpfslocalsys/cuda/11.4.3' +[default1]:Traceback (most recent call last): +[default1]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew2/Megatron-DeepSpeed/finetune_t0.py", line 188, in +[default1]: main() +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default1]: return f(*args, **kwargs) +[default1]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew2/Megatron-DeepSpeed/finetune_t0.py", line 180, in main +[default1]: pretrain( +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew2/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default1]: initialize_megatron(extra_args_provider=extra_args_provider, +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew2/Megatron-DeepSpeed/megatron/initialize.py", line 83, in initialize_megatron +[default1]: assert torch.cuda.is_available(), 'Megatron requires CUDA.' +[default1]:AssertionError: Megatron requires CUDA. +[default5]:Traceback (most recent call last): +[default5]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew2/Megatron-DeepSpeed/finetune_t0.py", line 188, in +[default5]: main() +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default5]: return f(*args, **kwargs) +[default5]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew2/Megatron-DeepSpeed/finetune_t0.py", line 180, in main +[default5]: pretrain( +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew2/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default5]: initialize_megatron(extra_args_provider=extra_args_provider, +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew2/Megatron-DeepSpeed/megatron/initialize.py", line 83, in initialize_megatron +[default5]: assert torch.cuda.is_available(), 'Megatron requires CUDA.' +[default5]:AssertionError: Megatron requires CUDA. +ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 0 (pid: 178808) of binary: /gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/bin/python +ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 0 (pid: 185267) of binary: /gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/bin/python +[default1]:usage: finetune_t0.py [-h] [--num-layers NUM_LAYERS] +[default1]: [--hidden-size HIDDEN_SIZE] +[default1]: [--ffn-hidden-size FFN_HIDDEN_SIZE] +[default1]: [--num-attention-heads NUM_ATTENTION_HEADS] +[default1]: [--kv-channels KV_CHANNELS] +[default1]: [--max-position-embeddings MAX_POSITION_EMBEDDINGS] +[default1]: [--make-vocab-size-divisible-by MAKE_VOCAB_SIZE_DIVISIBLE_BY] +[default1]: [--pad-vocab-size-to PAD_VOCAB_SIZE_TO] +[default1]: [--layernorm-epsilon LAYERNORM_EPSILON] +[default1]: [--sync-tp-duplicated-parameters] +[default1]: [--apply-residual-connection-post-layernorm] +[default1]: [--embed-layernorm] [--openai-gelu] +[default1]: [--onnx-safe ONNX_SAFE] [--bert-no-binary-head] +[default1]: [--position-embedding-type {PositionEmbeddingType.rotary,PositionEmbeddingType.absolute,PositionEmbeddingType.alibi}] +[default1]: [--glu-activation {geglu,liglu,reglu,swiglu}] +[default1]: [--kill-switch-path KILL_SWITCH_PATH] +[default1]: [--log-level {debug,info,warning,error,critical}] +[default1]: [--log-level-replica {debug,info,warning,error,critical}] +[default1]: [--attention-dropout ATTENTION_DROPOUT] +[default1]: [--hidden-dropout HIDDEN_DROPOUT] +[default1]: [--weight-decay WEIGHT_DECAY] [--clip-grad CLIP_GRAD] +[default1]: [--adam-beta1 ADAM_BETA1] [--adam-beta2 ADAM_BETA2] +[default1]: [--adam-eps ADAM_EPS] [--sgd-momentum SGD_MOMENTUM] +[default1]: [--micro-batch-size MICRO_BATCH_SIZE] +[default1]: [--batch-size BATCH_SIZE] +[default1]: [--global-batch-size GLOBAL_BATCH_SIZE] +[default1]: [--rampup-batch-size [RAMPUP_BATCH_SIZE [RAMPUP_BATCH_SIZE ...]]] +[default1]: [--checkpoint-activations] +[default1]: [--distribute-checkpointed-activations] +[default1]: [--checkpoint-num-layers CHECKPOINT_NUM_LAYERS] +[default1]: [--train-iters TRAIN_ITERS] +[default1]: [--train-samples TRAIN_SAMPLES] +[default1]: [--train-tokens TRAIN_TOKENS] +[default1]: [--log-interval LOG_INTERVAL] +[default1]: [--exit-interval EXIT_INTERVAL] +[default1]: [--exit-duration-in-mins EXIT_DURATION_IN_MINS] +[default1]: [--tensorboard-dir TENSORBOARD_DIR] +[default1]: [--no-masked-softmax-fusion] [--no-bias-gelu-fusion] +[default1]: [--no-bias-dropout-fusion] [--optimizer {adam,sgd}] +[default1]: [--use-bnb-optimizer] +[default1]: [--dataloader-type {single,cyclic}] [--cpu-optimizer] +[default1]: [--cpu_torch_adam] [--codecarbon-dir CODECARBON_DIR] +[default1]: [--eval-only EVAL_ONLY] +[default1]: [--skip-train-iteration-range SKIP_TRAIN_ITERATION_RANGE [SKIP_TRAIN_ITERATION_RANGE ...]] +[default1]: [--inference] +[default1]: [--abort-on-unmet-fused-kernel-constraints] +[default1]: [--pp-partition-method PP_PARTITION_METHOD] +[default1]: [--seed SEED] [--init-method-std INIT_METHOD_STD] +[default1]: [--init-method-xavier-uniform] [--lr LR] +[default1]: [--lr-decay-style {constant,linear,cosine}] +[default1]: [--lr-decay-iters LR_DECAY_ITERS] +[default1]: [--lr-decay-samples LR_DECAY_SAMPLES] +[default1]: [--lr-decay-tokens LR_DECAY_TOKENS] +[default1]: [--lr-warmup-fraction LR_WARMUP_FRACTION] +[default1]: [--lr-warmup-iters LR_WARMUP_ITERS] +[default1]: [--lr-warmup-samples LR_WARMUP_SAMPLES] +[default1]: [--warmup WARMUP] [--min-lr MIN_LR] +[default1]: [--override-lr-scheduler] +[default1]: [--use-checkpoint-lr-scheduler] [--save SAVE] +[default1]: [--save-interval SAVE_INTERVAL] [--no-save-optim] +[default1]: [--no-save-rng] [--load LOAD] [--no-load-optim] +[default1]: [--no-load-rng] [--finetune] [--fp16] [--bf16] +[default1]: [--loss-scale LOSS_SCALE] +[default1]: [--initial-loss-scale INITIAL_LOSS_SCALE] +[default1]: [--min-loss-scale MIN_LOSS_SCALE] +[default1]: [--loss-scale-window LOSS_SCALE_WINDOW] +[default1]: [--hysteresis HYSTERESIS] [--fp32-residual-connection] +[default1]: [--no-query-key-layer-scaling] +[default1]: [--attention-softmax-in-fp32] +[default1]: [--accumulate-allreduce-grads-in-fp32] +[default1]: [--fp16-lm-cross-entropy] +[default1]: [--tensor-model-parallel-size TENSOR_MODEL_PARALLEL_SIZE] +[default1]: [--pipeline-model-parallel-size PIPELINE_MODEL_PARALLEL_SIZE] +[default1]: [--model-parallel-size MODEL_PARALLEL_SIZE] +[default1]: [--num-layers-per-virtual-pipeline-stage NUM_LAYERS_PER_VIRTUAL_PIPELINE_STAGE] +[default1]: [--distributed-backend {nccl,gloo}] +[default1]: [--DDP-impl {local,torch}] +[default1]: [--use-contiguous-buffers-in-ddp] +[default1]: [--no-scatter-gather-tensors-in-pipeline] +[default1]: [--local_rank LOCAL_RANK] +[default1]: [--lazy-mpu-init LAZY_MPU_INIT] +[default1]: [--use-cpu-initialization] [--eval-iters EVAL_ITERS] +[default1]: [--eval-interval EVAL_INTERVAL] +[default1]: [--data-path [DATA_PATH [DATA_PATH ...]]] +[default1]: [--split SPLIT] +[default1]: [--train-weighted-split-paths [TRAIN_WEIGHTED_SPLIT_PATHS [TRAIN_WEIGHTED_SPLIT_PATHS ...]]] +[default1]: [--valid-weighted-split-paths [VALID_WEIGHTED_SPLIT_PATHS [VALID_WEIGHTED_SPLIT_PATHS ...]]] +[default1]: [--test-weighted-split-paths [TEST_WEIGHTED_SPLIT_PATHS [TEST_WEIGHTED_SPLIT_PATHS ...]]] +[default1]: [--train-weighted-split-paths-path TRAIN_WEIGHTED_SPLIT_PATHS_PATH] +[default1]: [--valid-weighted-split-paths-path VALID_WEIGHTED_SPLIT_PATHS_PATH] +[default1]: [--test-weighted-split-paths-path TEST_WEIGHTED_SPLIT_PATHS_PATH] +[default1]: [--log-path LOG_PATH] [--vocab-file VOCAB_FILE] +[default1]: [--merge-file MERGE_FILE] +[default1]: [--vocab-extra-ids VOCAB_EXTRA_IDS] +[default1]: [--seq-length SEQ_LENGTH] +[default1]: [--encoder-seq-length ENCODER_SEQ_LENGTH] +[default1]: [--decoder-seq-length DECODER_SEQ_LENGTH] +[default1]: [--retriever-seq-length RETRIEVER_SEQ_LENGTH] +[default1]: [--sample-rate SAMPLE_RATE] [--mask-prob MASK_PROB] +[default1]: [--short-seq-prob SHORT_SEQ_PROB] [--mmap-warmup] +[default1]: [--num-workers NUM_WORKERS] +[default1]: [--valid-num-workers VALID_NUM_WORKERS] +[default1]: [--tokenizer-type {BertWordPieceLowerCase,BertWordPieceCase,GPT2BPETokenizer,PretrainedFromHF}] +[default1]: [--tokenizer-name-or-path TOKENIZER_NAME_OR_PATH] +[default1]: [--data-impl {lazy,cached,mmap,infer}] +[default1]: [--reset-position-ids] [--reset-attention-mask] +[default1]: [--eod-mask-loss] [--loss-on-targets-only] +[default1]: [--norm-target-loss] +[default1]: [--reweight-loss-based-on-position-frequency] +[default1]: [--noise-density NOISE_DENSITY] +[default1]: [--mean-noise-span-length MEAN_NOISE_SPAN_LENGTH] +[default1]: [--prefixlm] [--adlr-autoresume] +[default1]: [--adlr-autoresume-interval ADLR_AUTORESUME_INTERVAL] +[default1]: [--ict-head-size ICT_HEAD_SIZE] +[default1]: [--biencoder-projection-dim BIENCODER_PROJECTION_DIM] +[default1]: [--biencoder-shared-query-context-model] +[default1]: [--ict-load ICT_LOAD] [--bert-load BERT_LOAD] +[default1]: [--titles-data-path TITLES_DATA_PATH] +[default1]: [--query-in-block-prob QUERY_IN_BLOCK_PROB] +[default1]: [--use-one-sent-docs] +[default1]: [--evidence-data-path EVIDENCE_DATA_PATH] +[default1]: [--retriever-report-topk-accuracies RETRIEVER_REPORT_TOPK_ACCURACIES [RETRIEVER_REPORT_TOPK_ACCURACIES ...]] +[default1]: [--retriever-score-scaling] +[default1]: [--block-data-path BLOCK_DATA_PATH] +[default1]: [--embedding-path EMBEDDING_PATH] +[default1]: [--indexer-batch-size INDEXER_BATCH_SIZE] +[default1]: [--indexer-log-interval INDEXER_LOG_INTERVAL] +[default1]: [--num-classes NUM_CLASSES] [--img-dim IMG_DIM] +[default1]: [--num-channels NUM_CHANNELS] [--patch-dim PATCH_DIM] +[default1]: [--log-params-norm] [--log-num-zeros-in-grad] +[default1]: [--tensorboard-log-interval TENSORBOARD_LOG_INTERVAL] +[default1]: [--tensorboard-queue-size TENSORBOARD_QUEUE_SIZE] +[default1]: [--log-timers-to-tensorboard] +[default1]: [--log-batch-size-to-tensorboard] +[default1]: [--no-log-learnig-rate-to-tensorboard] +[default1]: [--no-log-loss-scale-to-tensorboard] +[default1]: [--log-validation-ppl-to-tensorboard] +[default1]: [--zero-stage ZERO_STAGE] [--zero-reduce-scatter] +[default1]: [--zero-contigious-gradients] +[default1]: [--zero-reduce-bucket-size ZERO_REDUCE_BUCKET_SIZE] +[default1]: [--zero-allgather-bucket-size ZERO_ALLGATHER_BUCKET_SIZE] +[default1]: [--remote-device {none,cpu,nvme}] [--use-pin-memory] +[default1]: [--scattered-embeddings] [--split-transformers] +[default1]: [--memory-centric-tiled-linear] +[default1]: [--tile-factor TILE_FACTOR] +[default1]: [--deepspeed-activation-checkpointing] +[default1]: [--partition-activations] [--contigious-checkpointing] +[default1]: [--checkpoint-in-cpu] [--synchronize-each-layer] +[default1]: [--profile-backward] [--deepspeed] +[default1]: [--deepspeed_config DEEPSPEED_CONFIG] [--deepscale] +[default1]: [--deepscale_config DEEPSCALE_CONFIG] [--deepspeed_mpi] +[default1]:finetune_t0.py: error: unrecognized arguments: --reset-progress +[default4]:usage: finetune_t0.py [-h] [--num-layers NUM_LAYERS] +[default4]: [--hidden-size HIDDEN_SIZE] +[default4]: [--ffn-hidden-size FFN_HIDDEN_SIZE] +[default4]: [--num-attention-heads NUM_ATTENTION_HEADS] +[default4]: [--kv-channels KV_CHANNELS] +[default4]: [--max-position-embeddings MAX_POSITION_EMBEDDINGS] +[default4]: [--make-vocab-size-divisible-by MAKE_VOCAB_SIZE_DIVISIBLE_BY] +[default4]: [--pad-vocab-size-to PAD_VOCAB_SIZE_TO] +[default4]: [--layernorm-epsilon LAYERNORM_EPSILON] +[default4]: [--sync-tp-duplicated-parameters] +[default4]: [--apply-residual-connection-post-layernorm] +[default4]: [--embed-layernorm] [--openai-gelu] +[default4]: [--onnx-safe ONNX_SAFE] [--bert-no-binary-head] +[default4]: [--position-embedding-type {PositionEmbeddingType.rotary,PositionEmbeddingType.absolute,PositionEmbeddingType.alibi}] +[default4]: [--glu-activation {geglu,liglu,reglu,swiglu}] +[default4]: [--kill-switch-path KILL_SWITCH_PATH] +[default4]: [--log-level {debug,info,warning,error,critical}] +[default4]: [--log-level-replica {debug,info,warning,error,critical}] +[default4]: [--attention-dropout ATTENTION_DROPOUT] +[default4]: [--hidden-dropout HIDDEN_DROPOUT] +[default4]: [--weight-decay WEIGHT_DECAY] [--clip-grad CLIP_GRAD] +[default4]: [--adam-beta1 ADAM_BETA1] [--adam-beta2 ADAM_BETA2] +[default4]: [--adam-eps ADAM_EPS] [--sgd-momentum SGD_MOMENTUM] +[default4]: [--micro-batch-size MICRO_BATCH_SIZE] +[default4]: [--batch-size BATCH_SIZE] +[default4]: [--global-batch-size GLOBAL_BATCH_SIZE] +[default4]: [--rampup-batch-size [RAMPUP_BATCH_SIZE [RAMPUP_BATCH_SIZE ...]]] +[default4]: [--checkpoint-activations] +[default4]: [--distribute-checkpointed-activations] +[default4]: [--checkpoint-num-layers CHECKPOINT_NUM_LAYERS] +[default4]: [--train-iters TRAIN_ITERS] +[default4]: [--train-samples TRAIN_SAMPLES] +[default4]: [--train-tokens TRAIN_TOKENS] +[default4]: [--log-interval LOG_INTERVAL] +[default4]: [--exit-interval EXIT_INTERVAL] +[default4]: [--exit-duration-in-mins EXIT_DURATION_IN_MINS] +[default4]: [--tensorboard-dir TENSORBOARD_DIR] +[default4]: [--no-masked-softmax-fusion] [--no-bias-gelu-fusion] +[default4]: [--no-bias-dropout-fusion] [--optimizer {adam,sgd}] +[default4]: [--use-bnb-optimizer] +[default4]: [--dataloader-type {single,cyclic}] [--cpu-optimizer] +[default4]: [--cpu_torch_adam] [--codecarbon-dir CODECARBON_DIR] +[default4]: [--eval-only EVAL_ONLY] +[default4]: [--skip-train-iteration-range SKIP_TRAIN_ITERATION_RANGE [SKIP_TRAIN_ITERATION_RANGE ...]] +[default4]: [--inference] +[default4]: [--abort-on-unmet-fused-kernel-constraints] +[default4]: [--pp-partition-method PP_PARTITION_METHOD] +[default4]: [--seed SEED] [--init-method-std INIT_METHOD_STD] +[default4]: [--init-method-xavier-uniform] [--lr LR] +[default4]: [--lr-decay-style {constant,linear,cosine}] +[default4]: [--lr-decay-iters LR_DECAY_ITERS] +[default4]: [--lr-decay-samples LR_DECAY_SAMPLES] +[default4]: [--lr-decay-tokens LR_DECAY_TOKENS] +[default4]: [--lr-warmup-fraction LR_WARMUP_FRACTION] +[default4]: [--lr-warmup-iters LR_WARMUP_ITERS] +[default4]: [--lr-warmup-samples LR_WARMUP_SAMPLES] +[default4]: [--warmup WARMUP] [--min-lr MIN_LR] +[default4]: [--override-lr-scheduler] +[default4]: [--use-checkpoint-lr-scheduler] [--save SAVE] +[default4]: [--save-interval SAVE_INTERVAL] [--no-save-optim] +[default4]: [--no-save-rng] [--load LOAD] [--no-load-optim] +[default4]: [--no-load-rng] [--finetune] [--fp16] [--bf16] +[default4]: [--loss-scale LOSS_SCALE] +[default4]: [--initial-loss-scale INITIAL_LOSS_SCALE] +[default4]: [--min-loss-scale MIN_LOSS_SCALE] +[default4]: [--loss-scale-window LOSS_SCALE_WINDOW] +[default4]: [--hysteresis HYSTERESIS] [--fp32-residual-connection] +[default4]: [--no-query-key-layer-scaling] +[default4]: [--attention-softmax-in-fp32] +[default4]: [--accumulate-allreduce-grads-in-fp32] +[default4]: [--fp16-lm-cross-entropy] +[default4]: [--tensor-model-parallel-size TENSOR_MODEL_PARALLEL_SIZE] +[default4]: [--pipeline-model-parallel-size PIPELINE_MODEL_PARALLEL_SIZE] +[default4]: [--model-parallel-size MODEL_PARALLEL_SIZE] +[default4]: [--num-layers-per-virtual-pipeline-stage NUM_LAYERS_PER_VIRTUAL_PIPELINE_STAGE] +[default4]: [--distributed-backend {nccl,gloo}] +[default4]: [--DDP-impl {local,torch}] +[default4]: [--use-contiguous-buffers-in-ddp] +[default4]: [--no-scatter-gather-tensors-in-pipeline] +[default4]: [--local_rank LOCAL_RANK] +[default4]: [--lazy-mpu-init LAZY_MPU_INIT] +[default4]: [--use-cpu-initialization] [--eval-iters EVAL_ITERS] +[default4]: [--eval-interval EVAL_INTERVAL] +[default4]: [--data-path [DATA_PATH [DATA_PATH ...]]] +[default4]: [--split SPLIT] +[default4]: [--train-weighted-split-paths [TRAIN_WEIGHTED_SPLIT_PATHS [TRAIN_WEIGHTED_SPLIT_PATHS ...]]] +[default4]: [--valid-weighted-split-paths [VALID_WEIGHTED_SPLIT_PATHS [VALID_WEIGHTED_SPLIT_PATHS ...]]] +[default4]: [--test-weighted-split-paths [TEST_WEIGHTED_SPLIT_PATHS [TEST_WEIGHTED_SPLIT_PATHS ...]]] +[default4]: [--train-weighted-split-paths-path TRAIN_WEIGHTED_SPLIT_PATHS_PATH] +[default4]: [--valid-weighted-split-paths-path VALID_WEIGHTED_SPLIT_PATHS_PATH] +[default4]: [--test-weighted-split-paths-path TEST_WEIGHTED_SPLIT_PATHS_PATH] +[default4]: [--log-path LOG_PATH] [--vocab-file VOCAB_FILE] +[default4]: [--merge-file MERGE_FILE] +[default4]: [--vocab-extra-ids VOCAB_EXTRA_IDS] +[default4]: [--seq-length SEQ_LENGTH] +[default4]: [--encoder-seq-length ENCODER_SEQ_LENGTH] +[default4]: [--decoder-seq-length DECODER_SEQ_LENGTH] +[default4]: [--retriever-seq-length RETRIEVER_SEQ_LENGTH] +[default4]: [--sample-rate SAMPLE_RATE] [--mask-prob MASK_PROB] +[default4]: [--short-seq-prob SHORT_SEQ_PROB] [--mmap-warmup] +[default4]: [--num-workers NUM_WORKERS] +[default4]: [--valid-num-workers VALID_NUM_WORKERS] +[default4]: [--tokenizer-type {BertWordPieceLowerCase,BertWordPieceCase,GPT2BPETokenizer,PretrainedFromHF}] +[default4]: [--tokenizer-name-or-path TOKENIZER_NAME_OR_PATH] +[default4]: [--data-impl {lazy,cached,mmap,infer}] +[default4]: [--reset-position-ids] [--reset-attention-mask] +[default4]: [--eod-mask-loss] [--loss-on-targets-only] +[default4]: [--norm-target-loss] +[default4]: [--reweight-loss-based-on-position-frequency] +[default4]: [--noise-density NOISE_DENSITY] +[default4]: [--mean-noise-span-length MEAN_NOISE_SPAN_LENGTH] +[default4]: [--prefixlm] [--adlr-autoresume] +[default4]: [--adlr-autoresume-interval ADLR_AUTORESUME_INTERVAL] +[default4]: [--ict-head-size ICT_HEAD_SIZE] +[default4]: [--biencoder-projection-dim BIENCODER_PROJECTION_DIM] +[default4]: [--biencoder-shared-query-context-model] +[default4]: [--ict-load ICT_LOAD] [--bert-load BERT_LOAD] +[default4]: [--titles-data-path TITLES_DATA_PATH] +[default4]: [--query-in-block-prob QUERY_IN_BLOCK_PROB] +[default4]: [--use-one-sent-docs] +[default4]: [--evidence-data-path EVIDENCE_DATA_PATH] +[default4]: [--retriever-report-topk-accuracies RETRIEVER_REPORT_TOPK_ACCURACIES [RETRIEVER_REPORT_TOPK_ACCURACIES ...]] +[default4]: [--retriever-score-scaling] +[default4]: [--block-data-path BLOCK_DATA_PATH] +[default4]: [--embedding-path EMBEDDING_PATH] +[default4]: [--indexer-batch-size INDEXER_BATCH_SIZE] +[default4]: [--indexer-log-interval INDEXER_LOG_INTERVAL] +[default4]: [--num-classes NUM_CLASSES] [--img-dim IMG_DIM] +[default4]: [--num-channels NUM_CHANNELS] [--patch-dim PATCH_DIM] +[default4]: [--log-params-norm] [--log-num-zeros-in-grad] +[default4]: [--tensorboard-log-interval TENSORBOARD_LOG_INTERVAL] +[default4]: [--tensorboard-queue-size TENSORBOARD_QUEUE_SIZE] +[default4]: [--log-timers-to-tensorboard] +[default4]: [--log-batch-size-to-tensorboard] +[default4]: [--no-log-learnig-rate-to-tensorboard] +[default4]: [--no-log-loss-scale-to-tensorboard] +[default4]: [--log-validation-ppl-to-tensorboard] +[default4]: [--zero-stage ZERO_STAGE] [--zero-reduce-scatter] +[default4]: [--zero-contigious-gradients] +[default4]: [--zero-reduce-bucket-size ZERO_REDUCE_BUCKET_SIZE] +[default4]: [--zero-allgather-bucket-size ZERO_ALLGATHER_BUCKET_SIZE] +[default4]: [--remote-device {none,cpu,nvme}] [--use-pin-memory] +[default4]: [--scattered-embeddings] [--split-transformers] +[default4]: [--memory-centric-tiled-linear] +[default4]: [--tile-factor TILE_FACTOR] +[default4]: [--deepspeed-activation-checkpointing] +[default4]: [--partition-activations] [--contigious-checkpointing] +[default4]: [--checkpoint-in-cpu] [--synchronize-each-layer] +[default4]: [--profile-backward] [--deepspeed] +[default4]: [--deepspeed_config DEEPSPEED_CONFIG] [--deepscale] +[default4]: [--deepscale_config DEEPSCALE_CONFIG] [--deepspeed_mpi] +[default4]:finetune_t0.py: error: unrecognized arguments: --reset-progress +[default3]:usage: finetune_t0.py [-h] [--num-layers NUM_LAYERS] +[default3]: [--hidden-size HIDDEN_SIZE] +[default3]: [--ffn-hidden-size FFN_HIDDEN_SIZE] +[default3]: [--num-attention-heads NUM_ATTENTION_HEADS] +[default3]: [--kv-channels KV_CHANNELS] +[default3]: [--max-position-embeddings MAX_POSITION_EMBEDDINGS] +[default3]: [--make-vocab-size-divisible-by MAKE_VOCAB_SIZE_DIVISIBLE_BY] +[default3]: [--pad-vocab-size-to PAD_VOCAB_SIZE_TO] +[default3]: [--layernorm-epsilon LAYERNORM_EPSILON] +[default3]: [--sync-tp-duplicated-parameters] +[default3]: [--apply-residual-connection-post-layernorm] +[default3]: [--embed-layernorm] [--openai-gelu] +[default3]: [--onnx-safe ONNX_SAFE] [--bert-no-binary-head] +[default3]: [--position-embedding-type {PositionEmbeddingType.rotary,PositionEmbeddingType.absolute,PositionEmbeddingType.alibi}] +[default3]: [--glu-activation {geglu,liglu,reglu,swiglu}] +[default3]: [--kill-switch-path KILL_SWITCH_PATH] +[default3]: [--log-level {debug,info,warning,error,critical}] +[default3]: [--log-level-replica {debug,info,warning,error,critical}] +[default3]: [--attention-dropout ATTENTION_DROPOUT] +[default3]: [--hidden-dropout HIDDEN_DROPOUT] +[default3]: [--weight-decay WEIGHT_DECAY] [--clip-grad CLIP_GRAD] +[default3]: [--adam-beta1 ADAM_BETA1] [--adam-beta2 ADAM_BETA2] +[default3]: [--adam-eps ADAM_EPS] [--sgd-momentum SGD_MOMENTUM] +[default3]: [--micro-batch-size MICRO_BATCH_SIZE] +[default3]: [--batch-size BATCH_SIZE] +[default3]: [--global-batch-size GLOBAL_BATCH_SIZE] +[default3]: [--rampup-batch-size [RAMPUP_BATCH_SIZE [RAMPUP_BATCH_SIZE ...]]] +[default3]: [--checkpoint-activations] +[default3]: [--distribute-checkpointed-activations] +[default3]: [--checkpoint-num-layers CHECKPOINT_NUM_LAYERS] +[default3]: [--train-iters TRAIN_ITERS] +[default3]: [--train-samples TRAIN_SAMPLES] +[default3]: [--train-tokens TRAIN_TOKENS] +[default3]: [--log-interval LOG_INTERVAL] +[default3]: [--exit-interval EXIT_INTERVAL] +[default3]: [--exit-duration-in-mins EXIT_DURATION_IN_MINS] +[default3]: [--tensorboard-dir TENSORBOARD_DIR] +[default3]: [--no-masked-softmax-fusion] [--no-bias-gelu-fusion] +[default3]: [--no-bias-dropout-fusion] [--optimizer {adam,sgd}] +[default3]: [--use-bnb-optimizer] +[default3]: [--dataloader-type {single,cyclic}] [--cpu-optimizer] +[default3]: [--cpu_torch_adam] [--codecarbon-dir CODECARBON_DIR] +[default3]: [--eval-only EVAL_ONLY] +[default3]: [--skip-train-iteration-range SKIP_TRAIN_ITERATION_RANGE [SKIP_TRAIN_ITERATION_RANGE ...]] +[default3]: [--inference] +[default3]: [--abort-on-unmet-fused-kernel-constraints] +[default3]: [--pp-partition-method PP_PARTITION_METHOD] +[default3]: [--seed SEED] [--init-method-std INIT_METHOD_STD] +[default3]: [--init-method-xavier-uniform] [--lr LR] +[default3]: [--lr-decay-style {constant,linear,cosine}] +[default3]: [--lr-decay-iters LR_DECAY_ITERS] +[default3]: [--lr-decay-samples LR_DECAY_SAMPLES] +[default3]: [--lr-decay-tokens LR_DECAY_TOKENS] +[default3]: [--lr-warmup-fraction LR_WARMUP_FRACTION] +[default3]: [--lr-warmup-iters LR_WARMUP_ITERS] +[default3]: [--lr-warmup-samples LR_WARMUP_SAMPLES] +[default3]: [--warmup WARMUP] [--min-lr MIN_LR] +[default3]: [--override-lr-scheduler] +[default3]: [--use-checkpoint-lr-scheduler] [--save SAVE] +[default3]: [--save-interval SAVE_INTERVAL] [--no-save-optim] +[default3]: [--no-save-rng] [--load LOAD] [--no-load-optim] +[default3]: [--no-load-rng] [--finetune] [--fp16] [--bf16] +[default3]: [--loss-scale LOSS_SCALE] +[default3]: [--initial-loss-scale INITIAL_LOSS_SCALE] +[default3]: [--min-loss-scale MIN_LOSS_SCALE] +[default3]: [--loss-scale-window LOSS_SCALE_WINDOW] +[default3]: [--hysteresis HYSTERESIS] [--fp32-residual-connection] +[default3]: [--no-query-key-layer-scaling] +[default3]: [--attention-softmax-in-fp32] +[default3]: [--accumulate-allreduce-grads-in-fp32] +[default3]: [--fp16-lm-cross-entropy] +[default3]: [--tensor-model-parallel-size TENSOR_MODEL_PARALLEL_SIZE] +[default3]: [--pipeline-model-parallel-size PIPELINE_MODEL_PARALLEL_SIZE] +[default3]: [--model-parallel-size MODEL_PARALLEL_SIZE] +[default3]: [--num-layers-per-virtual-pipeline-stage NUM_LAYERS_PER_VIRTUAL_PIPELINE_STAGE] +[default3]: [--distributed-backend {nccl,gloo}] +[default3]: [--DDP-impl {local,torch}] +[default3]: [--use-contiguous-buffers-in-ddp] +[default3]: [--no-scatter-gather-tensors-in-pipeline] +[default3]: [--local_rank LOCAL_RANK] +[default3]: [--lazy-mpu-init LAZY_MPU_INIT] +[default3]: [--use-cpu-initialization] [--eval-iters EVAL_ITERS] +[default3]: [--eval-interval EVAL_INTERVAL] +[default3]: [--data-path [DATA_PATH [DATA_PATH ...]]] +[default3]: [--split SPLIT] +[default3]: [--train-weighted-split-paths [TRAIN_WEIGHTED_SPLIT_PATHS [TRAIN_WEIGHTED_SPLIT_PATHS ...]]] +[default3]: [--valid-weighted-split-paths [VALID_WEIGHTED_SPLIT_PATHS [VALID_WEIGHTED_SPLIT_PATHS ...]]] +[default3]: [--test-weighted-split-paths [TEST_WEIGHTED_SPLIT_PATHS [TEST_WEIGHTED_SPLIT_PATHS ...]]] +[default3]: [--train-weighted-split-paths-path TRAIN_WEIGHTED_SPLIT_PATHS_PATH] +[default3]: [--valid-weighted-split-paths-path VALID_WEIGHTED_SPLIT_PATHS_PATH] +[default3]: [--test-weighted-split-paths-path TEST_WEIGHTED_SPLIT_PATHS_PATH] +[default3]: [--log-path LOG_PATH] [--vocab-file VOCAB_FILE] +[default3]: [--merge-file MERGE_FILE] +[default3]: [--vocab-extra-ids VOCAB_EXTRA_IDS] +[default3]: [--seq-length SEQ_LENGTH] +[default3]: [--encoder-seq-length ENCODER_SEQ_LENGTH] +[default3]: [--decoder-seq-length DECODER_SEQ_LENGTH] +[default3]: [--retriever-seq-length RETRIEVER_SEQ_LENGTH] +[default3]: [--sample-rate SAMPLE_RATE] [--mask-prob MASK_PROB] +[default3]: [--short-seq-prob SHORT_SEQ_PROB] [--mmap-warmup] +[default3]: [--num-workers NUM_WORKERS] +[default3]: [--valid-num-workers VALID_NUM_WORKERS] +[default3]: [--tokenizer-type {BertWordPieceLowerCase,BertWordPieceCase,GPT2BPETokenizer,PretrainedFromHF}] +[default3]: [--tokenizer-name-or-path TOKENIZER_NAME_OR_PATH] +[default3]: [--data-impl {lazy,cached,mmap,infer}] +[default3]: [--reset-position-ids] [--reset-attention-mask] +[default3]: [--eod-mask-loss] [--loss-on-targets-only] +[default3]: [--norm-target-loss] +[default3]: [--reweight-loss-based-on-position-frequency] +[default3]: [--noise-density NOISE_DENSITY] +[default3]: [--mean-noise-span-length MEAN_NOISE_SPAN_LENGTH] +[default3]: [--prefixlm] [--adlr-autoresume] +[default3]: [--adlr-autoresume-interval ADLR_AUTORESUME_INTERVAL] +[default3]: [--ict-head-size ICT_HEAD_SIZE] +[default3]: [--biencoder-projection-dim BIENCODER_PROJECTION_DIM] +[default0]:usage: finetune_t0.py [-h] [--num-layers NUM_LAYERS] +[default0]: [--hidden-size HIDDEN_SIZE] +[default0]: [--ffn-hidden-size FFN_HIDDEN_SIZE] +[default3]: [--biencoder-shared-query-context-model] +[default0]: [--num-attention-heads NUM_ATTENTION_HEADS] +[default0]: [--kv-channels KV_CHANNELS] +[default3]: [--ict-load ICT_LOAD] [--bert-load BERT_LOAD] +[default0]: [--max-position-embeddings MAX_POSITION_EMBEDDINGS] +[default0]: [--make-vocab-size-divisible-by MAKE_VOCAB_SIZE_DIVISIBLE_BY] +[default3]: [--titles-data-path TITLES_DATA_PATH] +[default0]: [--pad-vocab-size-to PAD_VOCAB_SIZE_TO] +[default0]: [--layernorm-epsilon LAYERNORM_EPSILON] +[default0]: [--sync-tp-duplicated-parameters] +[default0]: [--apply-residual-connection-post-layernorm] +[default3]: [--query-in-block-prob QUERY_IN_BLOCK_PROB] +[default3]: [--use-one-sent-docs] +[default2]:usage: finetune_t0.py [-h] [--num-layers NUM_LAYERS] +[default2]: [--hidden-size HIDDEN_SIZE] +[default2]: [--ffn-hidden-size FFN_HIDDEN_SIZE] +[default2]: [--num-attention-heads NUM_ATTENTION_HEADS] +[default2]: [--kv-channels KV_CHANNELS] +[default2]: [--max-position-embeddings MAX_POSITION_EMBEDDINGS] +[default2]: [--make-vocab-size-divisible-by MAKE_VOCAB_SIZE_DIVISIBLE_BY] +[default2]: [--pad-vocab-size-to PAD_VOCAB_SIZE_TO] +[default2]: [--layernorm-epsilon LAYERNORM_EPSILON] +[default2]: [--sync-tp-duplicated-parameters] +[default2]: [--apply-residual-connection-post-layernorm] +[default2]: [--embed-layernorm] [--openai-gelu] +[default2]: [--onnx-safe ONNX_SAFE] [--bert-no-binary-head] +[default2]: [--position-embedding-type {PositionEmbeddingType.rotary,PositionEmbeddingType.absolute,PositionEmbeddingType.alibi}] +[default2]: [--glu-activation {geglu,liglu,reglu,swiglu}] +[default2]: [--kill-switch-path KILL_SWITCH_PATH] +[default2]: [--log-level {debug,info,warning,error,critical}] +[default2]: [--log-level-replica {debug,info,warning,error,critical}] +[default2]: [--attention-dropout ATTENTION_DROPOUT] +[default2]: [--hidden-dropout HIDDEN_DROPOUT] +[default2]: [--weight-decay WEIGHT_DECAY] [--clip-grad CLIP_GRAD] +[default2]: [--adam-beta1 ADAM_BETA1] [--adam-beta2 ADAM_BETA2] +[default2]: [--adam-eps ADAM_EPS] [--sgd-momentum SGD_MOMENTUM] +[default2]: [--micro-batch-size MICRO_BATCH_SIZE] +[default2]: [--batch-size BATCH_SIZE] +[default2]: [--global-batch-size GLOBAL_BATCH_SIZE] +[default2]: [--rampup-batch-size [RAMPUP_BATCH_SIZE [RAMPUP_BATCH_SIZE ...]]] +[default2]: [--checkpoint-activations] +[default2]: [--distribute-checkpointed-activations] +[default2]: [--checkpoint-num-layers CHECKPOINT_NUM_LAYERS] +[default2]: [--train-iters TRAIN_ITERS] +[default2]: [--train-samples TRAIN_SAMPLES] +[default2]: [--train-tokens TRAIN_TOKENS] +[default2]: [--log-interval LOG_INTERVAL] +[default2]: [--exit-interval EXIT_INTERVAL] +[default2]: [--exit-duration-in-mins EXIT_DURATION_IN_MINS] +[default2]: [--tensorboard-dir TENSORBOARD_DIR] +[default2]: [--no-masked-softmax-fusion] [--no-bias-gelu-fusion] +[default2]: [--no-bias-dropout-fusion] [--optimizer {adam,sgd}] +[default2]: [--use-bnb-optimizer] +[default2]: [--dataloader-type {single,cyclic}] [--cpu-optimizer] +[default2]: [--cpu_torch_adam] [--codecarbon-dir CODECARBON_DIR] +[default2]: [--eval-only EVAL_ONLY] +[default2]: [--skip-train-iteration-range SKIP_TRAIN_ITERATION_RANGE [SKIP_TRAIN_ITERATION_RANGE ...]] +[default2]: [--inference] +[default2]: [--abort-on-unmet-fused-kernel-constraints] +[default2]: [--pp-partition-method PP_PARTITION_METHOD] +[default2]: [--seed SEED] [--init-method-std INIT_METHOD_STD] +[default2]: [--init-method-xavier-uniform] [--lr LR] +[default2]: [--lr-decay-style {constant,linear,cosine}] +[default2]: [--lr-decay-iters LR_DECAY_ITERS] +[default2]: [--lr-decay-samples LR_DECAY_SAMPLES] +[default2]: [--lr-decay-tokens LR_DECAY_TOKENS] +[default2]: [--lr-warmup-fraction LR_WARMUP_FRACTION] +[default2]: [--lr-warmup-iters LR_WARMUP_ITERS] +[default2]: [--lr-warmup-samples LR_WARMUP_SAMPLES] +[default2]: [--warmup WARMUP] [--min-lr MIN_LR] +[default0]: [--embed-layernorm] [--openai-gelu] +[default2]: [--override-lr-scheduler] +[default0]: [--onnx-safe ONNX_SAFE] [--bert-no-binary-head] +[default0]: [--position-embedding-type {PositionEmbeddingType.rotary,PositionEmbeddingType.absolute,PositionEmbeddingType.alibi}] +[default0]: [--glu-activation {geglu,liglu,reglu,swiglu}] +[default0]: [--kill-switch-path KILL_SWITCH_PATH] +[default2]: [--use-checkpoint-lr-scheduler] [--save SAVE] +[default0]: [--log-level {debug,info,warning,error,critical}] +[default0]: [--log-level-replica {debug,info,warning,error,critical}] +[default0]: [--attention-dropout ATTENTION_DROPOUT] +[default2]: [--save-interval SAVE_INTERVAL] [--no-save-optim] +[default2]: [--no-save-rng] [--load LOAD] [--no-load-optim] +[default3]: [--evidence-data-path EVIDENCE_DATA_PATH] +[default3]: [--retriever-report-topk-accuracies RETRIEVER_REPORT_TOPK_ACCURACIES [RETRIEVER_REPORT_TOPK_ACCURACIES ...]] +[default3]: [--retriever-score-scaling] +[default3]: [--block-data-path BLOCK_DATA_PATH] +[default3]: [--embedding-path EMBEDDING_PATH] +[default3]: [--indexer-batch-size INDEXER_BATCH_SIZE] +[default3]: [--indexer-log-interval INDEXER_LOG_INTERVAL] +[default3]: [--num-classes NUM_CLASSES] [--img-dim IMG_DIM] +[default3]: [--num-channels NUM_CHANNELS] [--patch-dim PATCH_DIM] +[default3]: [--log-params-norm] [--log-num-zeros-in-grad] +[default3]: [--tensorboard-log-interval TENSORBOARD_LOG_INTERVAL] +[default3]: [--tensorboard-queue-size TENSORBOARD_QUEUE_SIZE] +[default3]: [--log-timers-to-tensorboard] +[default3]: [--log-batch-size-to-tensorboard] +[default3]: [--no-log-learnig-rate-to-tensorboard] +[default3]: [--no-log-loss-scale-to-tensorboard] +[default0]: [--hidden-dropout HIDDEN_DROPOUT] +[default0]: [--weight-decay WEIGHT_DECAY] [--clip-grad CLIP_GRAD] +[default0]: [--adam-beta1 ADAM_BETA1] [--adam-beta2 ADAM_BETA2] +[default0]: [--adam-eps ADAM_EPS] [--sgd-momentum SGD_MOMENTUM] +[default0]: [--micro-batch-size MICRO_BATCH_SIZE] +[default0]: [--batch-size BATCH_SIZE] +[default0]: [--global-batch-size GLOBAL_BATCH_SIZE] +[default0]: [--rampup-batch-size [RAMPUP_BATCH_SIZE [RAMPUP_BATCH_SIZE ...]]] +[default0]: [--checkpoint-activations] +[default0]: [--distribute-checkpointed-activations] +[default0]: [--checkpoint-num-layers CHECKPOINT_NUM_LAYERS] +[default0]: [--train-iters TRAIN_ITERS] +[default0]: [--train-samples TRAIN_SAMPLES] +[default0]: [--train-tokens TRAIN_TOKENS] +[default0]: [--log-interval LOG_INTERVAL] +[default0]: [--exit-interval EXIT_INTERVAL] +[default0]: [--exit-duration-in-mins EXIT_DURATION_IN_MINS] +[default0]: [--tensorboard-dir TENSORBOARD_DIR] +[default0]: [--no-masked-softmax-fusion] [--no-bias-gelu-fusion] +[default3]: [--log-validation-ppl-to-tensorboard] +[default3]: [--zero-stage ZERO_STAGE] [--zero-reduce-scatter] +[default3]: [--zero-contigious-gradients] +[default3]: [--zero-reduce-bucket-size ZERO_REDUCE_BUCKET_SIZE] +[default3]: [--zero-allgather-bucket-size ZERO_ALLGATHER_BUCKET_SIZE] +[default3]: [--remote-device {none,cpu,nvme}] [--use-pin-memory] +[default3]: [--scattered-embeddings] [--split-transformers] +[default3]: [--memory-centric-tiled-linear] +[default3]: [--tile-factor TILE_FACTOR] +[default3]: [--deepspeed-activation-checkpointing] +[default3]: [--partition-activations] [--contigious-checkpointing] +[default2]: [--no-load-rng] [--finetune] [--fp16] [--bf16] +[default2]: [--loss-scale LOSS_SCALE] +[default2]: [--initial-loss-scale INITIAL_LOSS_SCALE] +[default3]: [--checkpoint-in-cpu] [--synchronize-each-layer] +[default3]: [--profile-backward] [--deepspeed] +[default3]: [--deepspeed_config DEEPSPEED_CONFIG] [--deepscale] +[default3]: [--deepscale_config DEEPSCALE_CONFIG] [--deepspeed_mpi] +[default2]: [--min-loss-scale MIN_LOSS_SCALE] +[default2]: [--loss-scale-window LOSS_SCALE_WINDOW] +[default2]: [--hysteresis HYSTERESIS] [--fp32-residual-connection] +[default3]:finetune_t0.py: error: unrecognized arguments: --reset-progress +[default2]: [--no-query-key-layer-scaling] +[default2]: [--attention-softmax-in-fp32] +[default0]: [--no-bias-dropout-fusion] [--optimizer {adam,sgd}] +[default2]: [--accumulate-allreduce-grads-in-fp32] +[default0]: [--use-bnb-optimizer] +[default0]: [--dataloader-type {single,cyclic}] [--cpu-optimizer] +[default0]: [--cpu_torch_adam] [--codecarbon-dir CODECARBON_DIR] +[default0]: [--eval-only EVAL_ONLY] +[default0]: [--skip-train-iteration-range SKIP_TRAIN_ITERATION_RANGE [SKIP_TRAIN_ITERATION_RANGE ...]] +[default0]: [--inference] +[default2]: [--fp16-lm-cross-entropy] +[default2]: [--tensor-model-parallel-size TENSOR_MODEL_PARALLEL_SIZE] +[default2]: [--pipeline-model-parallel-size PIPELINE_MODEL_PARALLEL_SIZE] +[default2]: [--model-parallel-size MODEL_PARALLEL_SIZE] +[default2]: [--num-layers-per-virtual-pipeline-stage NUM_LAYERS_PER_VIRTUAL_PIPELINE_STAGE] +[default2]: [--distributed-backend {nccl,gloo}] +[default2]: [--DDP-impl {local,torch}] +[default2]: [--use-contiguous-buffers-in-ddp] +[default2]: [--no-scatter-gather-tensors-in-pipeline] +[default2]: [--local_rank LOCAL_RANK] +[default2]: [--lazy-mpu-init LAZY_MPU_INIT] +[default2]: [--use-cpu-initialization] [--eval-iters EVAL_ITERS] +[default2]: [--eval-interval EVAL_INTERVAL] +[default0]: [--abort-on-unmet-fused-kernel-constraints] +[default0]: [--pp-partition-method PP_PARTITION_METHOD] +[default0]: [--seed SEED] [--init-method-std INIT_METHOD_STD] +[default0]: [--init-method-xavier-uniform] [--lr LR] +[default2]: [--data-path [DATA_PATH [DATA_PATH ...]]] +[default2]: [--split SPLIT] +[default2]: [--train-weighted-split-paths [TRAIN_WEIGHTED_SPLIT_PATHS [TRAIN_WEIGHTED_SPLIT_PATHS ...]]] +[default0]: [--lr-decay-style {constant,linear,cosine}] +[default0]: [--lr-decay-iters LR_DECAY_ITERS] +[default0]: [--lr-decay-samples LR_DECAY_SAMPLES] +[default0]: [--lr-decay-tokens LR_DECAY_TOKENS] +[default0]: [--lr-warmup-fraction LR_WARMUP_FRACTION] +[default0]: [--lr-warmup-iters LR_WARMUP_ITERS] +[default0]: [--lr-warmup-samples LR_WARMUP_SAMPLES] +[default2]: [--valid-weighted-split-paths [VALID_WEIGHTED_SPLIT_PATHS [VALID_WEIGHTED_SPLIT_PATHS ...]]] +[default2]: [--test-weighted-split-paths [TEST_WEIGHTED_SPLIT_PATHS [TEST_WEIGHTED_SPLIT_PATHS ...]]] +[default2]: [--train-weighted-split-paths-path TRAIN_WEIGHTED_SPLIT_PATHS_PATH] +[default0]: [--warmup WARMUP] [--min-lr MIN_LR] +[default0]: [--override-lr-scheduler] +[default0]: [--use-checkpoint-lr-scheduler] [--save SAVE] +[default2]: [--valid-weighted-split-paths-path VALID_WEIGHTED_SPLIT_PATHS_PATH] +[default2]: [--test-weighted-split-paths-path TEST_WEIGHTED_SPLIT_PATHS_PATH] +[default2]: [--log-path LOG_PATH] [--vocab-file VOCAB_FILE] +[default0]: [--save-interval SAVE_INTERVAL] [--no-save-optim] +[default0]: [--no-save-rng] [--load LOAD] [--no-load-optim] +[default0]: [--no-load-rng] [--finetune] [--fp16] [--bf16] +[default0]: [--loss-scale LOSS_SCALE] +[default0]: [--initial-loss-scale INITIAL_LOSS_SCALE] +[default0]: [--min-loss-scale MIN_LOSS_SCALE] +[default0]: [--loss-scale-window LOSS_SCALE_WINDOW] +[default2]: [--merge-file MERGE_FILE] +[default2]: [--vocab-extra-ids VOCAB_EXTRA_IDS] +[default0]: [--hysteresis HYSTERESIS] [--fp32-residual-connection] +[default0]: [--no-query-key-layer-scaling] +[default0]: [--attention-softmax-in-fp32] +[default2]: [--seq-length SEQ_LENGTH] +[default2]: [--encoder-seq-length ENCODER_SEQ_LENGTH] +[default0]: [--accumulate-allreduce-grads-in-fp32] +[default0]: [--fp16-lm-cross-entropy] +[default0]: [--tensor-model-parallel-size TENSOR_MODEL_PARALLEL_SIZE] +[default2]: [--decoder-seq-length DECODER_SEQ_LENGTH] +[default2]: [--retriever-seq-length RETRIEVER_SEQ_LENGTH] +[default0]: [--pipeline-model-parallel-size PIPELINE_MODEL_PARALLEL_SIZE] +[default0]: [--model-parallel-size MODEL_PARALLEL_SIZE] +[default0]: [--num-layers-per-virtual-pipeline-stage NUM_LAYERS_PER_VIRTUAL_PIPELINE_STAGE] +[default2]: [--sample-rate SAMPLE_RATE] [--mask-prob MASK_PROB] +[default2]: [--short-seq-prob SHORT_SEQ_PROB] [--mmap-warmup] +[default0]: [--distributed-backend {nccl,gloo}] +[default0]: [--DDP-impl {local,torch}] +[default2]: [--num-workers NUM_WORKERS] +[default2]: [--valid-num-workers VALID_NUM_WORKERS] +[default0]: [--use-contiguous-buffers-in-ddp] +[default0]: [--no-scatter-gather-tensors-in-pipeline] +[default0]: [--local_rank LOCAL_RANK] +[default2]: [--tokenizer-type {BertWordPieceLowerCase,BertWordPieceCase,GPT2BPETokenizer,PretrainedFromHF}] +[default2]: [--tokenizer-name-or-path TOKENIZER_NAME_OR_PATH] +[default0]: [--lazy-mpu-init LAZY_MPU_INIT] +[default0]: [--use-cpu-initialization] [--eval-iters EVAL_ITERS] +[default0]: [--eval-interval EVAL_INTERVAL] +[default2]: [--data-impl {lazy,cached,mmap,infer}] +[default2]: [--reset-position-ids] [--reset-attention-mask] +[default2]: [--eod-mask-loss] [--loss-on-targets-only] +[default2]: [--norm-target-loss] +[default0]: [--data-path [DATA_PATH [DATA_PATH ...]]] +[default0]: [--split SPLIT] +[default2]: [--reweight-loss-based-on-position-frequency] +[default2]: [--noise-density NOISE_DENSITY] +[default2]: [--mean-noise-span-length MEAN_NOISE_SPAN_LENGTH] +[default0]: [--train-weighted-split-paths [TRAIN_WEIGHTED_SPLIT_PATHS [TRAIN_WEIGHTED_SPLIT_PATHS ...]]] +[default0]: [--valid-weighted-split-paths [VALID_WEIGHTED_SPLIT_PATHS [VALID_WEIGHTED_SPLIT_PATHS ...]]] +[default0]: [--test-weighted-split-paths [TEST_WEIGHTED_SPLIT_PATHS [TEST_WEIGHTED_SPLIT_PATHS ...]]] +[default2]: [--prefixlm] [--adlr-autoresume] +[default2]: [--adlr-autoresume-interval ADLR_AUTORESUME_INTERVAL] +[default2]: [--ict-head-size ICT_HEAD_SIZE] +[default2]: [--biencoder-projection-dim BIENCODER_PROJECTION_DIM] +[default0]: [--train-weighted-split-paths-path TRAIN_WEIGHTED_SPLIT_PATHS_PATH] +[default0]: [--valid-weighted-split-paths-path VALID_WEIGHTED_SPLIT_PATHS_PATH] +[default0]: [--test-weighted-split-paths-path TEST_WEIGHTED_SPLIT_PATHS_PATH] +[default2]: [--biencoder-shared-query-context-model] +[default2]: [--ict-load ICT_LOAD] [--bert-load BERT_LOAD] +[default0]: [--log-path LOG_PATH] [--vocab-file VOCAB_FILE] +[default0]: [--merge-file MERGE_FILE] +[default0]: [--vocab-extra-ids VOCAB_EXTRA_IDS] +[default2]: [--titles-data-path TITLES_DATA_PATH] +[default2]: [--query-in-block-prob QUERY_IN_BLOCK_PROB] +[default0]: [--seq-length SEQ_LENGTH] +[default0]: [--encoder-seq-length ENCODER_SEQ_LENGTH] +[default0]: [--decoder-seq-length DECODER_SEQ_LENGTH] +[default2]: [--use-one-sent-docs] +[default2]: [--evidence-data-path EVIDENCE_DATA_PATH] +[default0]: [--retriever-seq-length RETRIEVER_SEQ_LENGTH] +[default0]: [--sample-rate SAMPLE_RATE] [--mask-prob MASK_PROB] +[default0]: [--short-seq-prob SHORT_SEQ_PROB] [--mmap-warmup] +[default2]: [--retriever-report-topk-accuracies RETRIEVER_REPORT_TOPK_ACCURACIES [RETRIEVER_REPORT_TOPK_ACCURACIES ...]] +[default2]: [--retriever-score-scaling] +[default0]: [--num-workers NUM_WORKERS] +[default0]: [--valid-num-workers VALID_NUM_WORKERS] +[default0]: [--tokenizer-type {BertWordPieceLowerCase,BertWordPieceCase,GPT2BPETokenizer,PretrainedFromHF}] +[default0]: [--tokenizer-name-or-path TOKENIZER_NAME_OR_PATH] +[default0]: [--data-impl {lazy,cached,mmap,infer}] +[default0]: [--reset-position-ids] [--reset-attention-mask] +[default0]: [--eod-mask-loss] [--loss-on-targets-only] +[default0]: [--norm-target-loss] +[default2]: [--block-data-path BLOCK_DATA_PATH] +[default2]: [--embedding-path EMBEDDING_PATH] +[default0]: [--reweight-loss-based-on-position-frequency] +[default0]: [--noise-density NOISE_DENSITY] +[default0]: [--mean-noise-span-length MEAN_NOISE_SPAN_LENGTH] +[default0]: [--prefixlm] [--adlr-autoresume] +[default2]: [--indexer-batch-size INDEXER_BATCH_SIZE] +[default2]: [--indexer-log-interval INDEXER_LOG_INTERVAL] +[default2]: [--num-classes NUM_CLASSES] [--img-dim IMG_DIM] +[default0]: [--adlr-autoresume-interval ADLR_AUTORESUME_INTERVAL] +[default0]: [--ict-head-size ICT_HEAD_SIZE] +[default0]: [--biencoder-projection-dim BIENCODER_PROJECTION_DIM] +[default2]: [--num-channels NUM_CHANNELS] [--patch-dim PATCH_DIM] +[default2]: [--log-params-norm] [--log-num-zeros-in-grad] +[default2]: [--tensorboard-log-interval TENSORBOARD_LOG_INTERVAL] +[default0]: [--biencoder-shared-query-context-model] +[default0]: [--ict-load ICT_LOAD] [--bert-load BERT_LOAD] +[default0]: [--titles-data-path TITLES_DATA_PATH] +[default2]: [--tensorboard-queue-size TENSORBOARD_QUEUE_SIZE] +[default2]: [--log-timers-to-tensorboard] +[default0]: [--query-in-block-prob QUERY_IN_BLOCK_PROB] +[default0]: [--use-one-sent-docs] +[default0]: [--evidence-data-path EVIDENCE_DATA_PATH] +[default2]: [--log-batch-size-to-tensorboard] +[default2]: [--no-log-learnig-rate-to-tensorboard] +[default2]: [--no-log-loss-scale-to-tensorboard] +[default2]: [--log-validation-ppl-to-tensorboard] +[default0]: [--retriever-report-topk-accuracies RETRIEVER_REPORT_TOPK_ACCURACIES [RETRIEVER_REPORT_TOPK_ACCURACIES ...]] +[default0]: [--retriever-score-scaling] +[default0]: [--block-data-path BLOCK_DATA_PATH] +[default0]: [--embedding-path EMBEDDING_PATH] +[default2]: [--zero-stage ZERO_STAGE] [--zero-reduce-scatter] +[default2]: [--zero-contigious-gradients] +[default0]: [--indexer-batch-size INDEXER_BATCH_SIZE] +[default0]: [--indexer-log-interval INDEXER_LOG_INTERVAL] +[default2]: [--zero-reduce-bucket-size ZERO_REDUCE_BUCKET_SIZE] +[default2]: [--zero-allgather-bucket-size ZERO_ALLGATHER_BUCKET_SIZE] +[default2]: [--remote-device {none,cpu,nvme}] [--use-pin-memory] +[default0]: [--num-classes NUM_CLASSES] [--img-dim IMG_DIM] +[default0]: [--num-channels NUM_CHANNELS] [--patch-dim PATCH_DIM] +[default0]: [--log-params-norm] [--log-num-zeros-in-grad] +[default0]: [--tensorboard-log-interval TENSORBOARD_LOG_INTERVAL] +[default2]: [--scattered-embeddings] [--split-transformers] +[default2]: [--memory-centric-tiled-linear] +[default0]: [--tensorboard-queue-size TENSORBOARD_QUEUE_SIZE] +[default0]: [--log-timers-to-tensorboard] +[default0]: [--log-batch-size-to-tensorboard] +[default2]: [--tile-factor TILE_FACTOR] +[default2]: [--deepspeed-activation-checkpointing] +[default2]: [--partition-activations] [--contigious-checkpointing] +[default2]: [--checkpoint-in-cpu] [--synchronize-each-layer] +[default2]: [--profile-backward] [--deepspeed] +[default0]: [--no-log-learnig-rate-to-tensorboard] +[default0]: [--no-log-loss-scale-to-tensorboard] +[default0]: [--log-validation-ppl-to-tensorboard] +[default2]: [--deepspeed_config DEEPSPEED_CONFIG] [--deepscale] +[default2]: [--deepscale_config DEEPSCALE_CONFIG] [--deepspeed_mpi] +[default2]:finetune_t0.py: error: unrecognized arguments: --reset-progress +[default0]: [--zero-stage ZERO_STAGE] [--zero-reduce-scatter] +[default0]: [--zero-contigious-gradients] +[default0]: [--zero-reduce-bucket-size ZERO_REDUCE_BUCKET_SIZE] +[default0]: [--zero-allgather-bucket-size ZERO_ALLGATHER_BUCKET_SIZE] +[default0]: [--remote-device {none,cpu,nvme}] [--use-pin-memory] +[default0]: [--scattered-embeddings] [--split-transformers] +[default0]: [--memory-centric-tiled-linear] +[default0]: [--tile-factor TILE_FACTOR] +[default0]: [--deepspeed-activation-checkpointing] +[default0]: [--partition-activations] [--contigious-checkpointing] +[default0]: [--checkpoint-in-cpu] [--synchronize-each-layer] +[default0]: [--profile-backward] [--deepspeed] +[default0]: [--deepspeed_config DEEPSPEED_CONFIG] [--deepscale] +[default0]: [--deepscale_config DEEPSCALE_CONFIG] [--deepspeed_mpi] +[default0]:finetune_t0.py: error: unrecognized arguments: --reset-progress +[default7]:usage: finetune_t0.py [-h] [--num-layers NUM_LAYERS] +[default7]: [--hidden-size HIDDEN_SIZE] +[default7]: [--ffn-hidden-size FFN_HIDDEN_SIZE] +[default7]: [--num-attention-heads NUM_ATTENTION_HEADS] +[default7]: [--kv-channels KV_CHANNELS] +[default5]:usage: finetune_t0.py [-h] [--num-layers NUM_LAYERS] +[default5]: [--hidden-size HIDDEN_SIZE] +[default5]: [--ffn-hidden-size FFN_HIDDEN_SIZE] +[default6]:usage: finetune_t0.py [-h] [--num-layers NUM_LAYERS] +[default6]: [--hidden-size HIDDEN_SIZE] +[default6]: [--ffn-hidden-size FFN_HIDDEN_SIZE] +[default6]: [--num-attention-heads NUM_ATTENTION_HEADS] +[default7]: [--max-position-embeddings MAX_POSITION_EMBEDDINGS] +[default7]: [--make-vocab-size-divisible-by MAKE_VOCAB_SIZE_DIVISIBLE_BY] +[default7]: [--pad-vocab-size-to PAD_VOCAB_SIZE_TO] +[default7]: [--layernorm-epsilon LAYERNORM_EPSILON] +[default5]: [--num-attention-heads NUM_ATTENTION_HEADS] +[default5]: [--kv-channels KV_CHANNELS] +[default7]: [--sync-tp-duplicated-parameters] +[default7]: [--apply-residual-connection-post-layernorm] +[default7]: [--embed-layernorm] [--openai-gelu] +[default6]: [--kv-channels KV_CHANNELS] +[default6]: [--max-position-embeddings MAX_POSITION_EMBEDDINGS] +[default6]: [--make-vocab-size-divisible-by MAKE_VOCAB_SIZE_DIVISIBLE_BY] +[default7]: [--onnx-safe ONNX_SAFE] [--bert-no-binary-head] +[default7]: [--position-embedding-type {PositionEmbeddingType.rotary,PositionEmbeddingType.absolute,PositionEmbeddingType.alibi}] +[default5]: [--max-position-embeddings MAX_POSITION_EMBEDDINGS] +[default5]: [--make-vocab-size-divisible-by MAKE_VOCAB_SIZE_DIVISIBLE_BY] +[default7]: [--glu-activation {geglu,liglu,reglu,swiglu}] +[default7]: [--kill-switch-path KILL_SWITCH_PATH] +[default6]: [--pad-vocab-size-to PAD_VOCAB_SIZE_TO] +[default6]: [--layernorm-epsilon LAYERNORM_EPSILON] +[default6]: [--sync-tp-duplicated-parameters] +[default7]: [--log-level {debug,info,warning,error,critical}] +[default7]: [--log-level-replica {debug,info,warning,error,critical}] +[default5]: [--pad-vocab-size-to PAD_VOCAB_SIZE_TO] +[default5]: [--layernorm-epsilon LAYERNORM_EPSILON] +[default5]: [--sync-tp-duplicated-parameters] +[default7]: [--attention-dropout ATTENTION_DROPOUT] +[default7]: [--hidden-dropout HIDDEN_DROPOUT] +[default6]: [--apply-residual-connection-post-layernorm] +[default6]: [--embed-layernorm] [--openai-gelu] +[default6]: [--onnx-safe ONNX_SAFE] [--bert-no-binary-head] +[default6]: [--position-embedding-type {PositionEmbeddingType.rotary,PositionEmbeddingType.absolute,PositionEmbeddingType.alibi}] +[default6]: [--glu-activation {geglu,liglu,reglu,swiglu}] +[default6]: [--kill-switch-path KILL_SWITCH_PATH] +[default7]: [--weight-decay WEIGHT_DECAY] [--clip-grad CLIP_GRAD] +[default7]: [--adam-beta1 ADAM_BETA1] [--adam-beta2 ADAM_BETA2] +[default5]: [--apply-residual-connection-post-layernorm] +[default5]: [--embed-layernorm] [--openai-gelu] +[default7]: [--adam-eps ADAM_EPS] [--sgd-momentum SGD_MOMENTUM] +[default7]: [--micro-batch-size MICRO_BATCH_SIZE] +[default7]: [--batch-size BATCH_SIZE] +[default7]: [--global-batch-size GLOBAL_BATCH_SIZE] +[default5]: [--onnx-safe ONNX_SAFE] [--bert-no-binary-head] +[default5]: [--position-embedding-type {PositionEmbeddingType.rotary,PositionEmbeddingType.absolute,PositionEmbeddingType.alibi}] +[default7]: [--rampup-batch-size [RAMPUP_BATCH_SIZE [RAMPUP_BATCH_SIZE ...]]] +[default7]: [--checkpoint-activations] +[default7]: [--distribute-checkpointed-activations] +[default7]: [--checkpoint-num-layers CHECKPOINT_NUM_LAYERS] +[default5]: [--glu-activation {geglu,liglu,reglu,swiglu}] +[default5]: [--kill-switch-path KILL_SWITCH_PATH] +[default5]: [--log-level {debug,info,warning,error,critical}] +[default6]: [--log-level {debug,info,warning,error,critical}] +[default5]: [--log-level-replica {debug,info,warning,error,critical}] +[default6]: [--log-level-replica {debug,info,warning,error,critical}] +[default7]: [--train-iters TRAIN_ITERS] +[default7]: [--train-samples TRAIN_SAMPLES] +[default6]: [--attention-dropout ATTENTION_DROPOUT] +[default6]: [--hidden-dropout HIDDEN_DROPOUT] +[default6]: [--weight-decay WEIGHT_DECAY] [--clip-grad CLIP_GRAD] +[default7]: [--train-tokens TRAIN_TOKENS] +[default7]: [--log-interval LOG_INTERVAL] +[default5]: [--attention-dropout ATTENTION_DROPOUT] +[default5]: [--hidden-dropout HIDDEN_DROPOUT] +[default7]: [--exit-interval EXIT_INTERVAL] +[default6]: [--adam-beta1 ADAM_BETA1] [--adam-beta2 ADAM_BETA2] +[default6]: [--adam-eps ADAM_EPS] [--sgd-momentum SGD_MOMENTUM] +[default6]: [--micro-batch-size MICRO_BATCH_SIZE] +[default6]: [--batch-size BATCH_SIZE] +[default5]: [--weight-decay WEIGHT_DECAY] [--clip-grad CLIP_GRAD] +[default5]: [--adam-beta1 ADAM_BETA1] [--adam-beta2 ADAM_BETA2] +[default5]: [--adam-eps ADAM_EPS] [--sgd-momentum SGD_MOMENTUM] +[default7]: [--exit-duration-in-mins EXIT_DURATION_IN_MINS] +[default7]: [--tensorboard-dir TENSORBOARD_DIR] +[default5]: [--micro-batch-size MICRO_BATCH_SIZE] +[default5]: [--batch-size BATCH_SIZE] +[default7]: [--no-masked-softmax-fusion] [--no-bias-gelu-fusion] +[default7]: [--no-bias-dropout-fusion] [--optimizer {adam,sgd}] +[default5]: [--global-batch-size GLOBAL_BATCH_SIZE] +[default5]: [--rampup-batch-size [RAMPUP_BATCH_SIZE [RAMPUP_BATCH_SIZE ...]]] +[default6]: [--global-batch-size GLOBAL_BATCH_SIZE] +[default6]: [--rampup-batch-size [RAMPUP_BATCH_SIZE [RAMPUP_BATCH_SIZE ...]]] +[default7]: [--use-bnb-optimizer] +[default7]: [--dataloader-type {single,cyclic}] [--cpu-optimizer] +[default5]: [--checkpoint-activations] +[default5]: [--distribute-checkpointed-activations] +[default6]: [--checkpoint-activations] +[default7]: [--cpu_torch_adam] [--codecarbon-dir CODECARBON_DIR] +[default7]: [--eval-only EVAL_ONLY] +[default7]: [--skip-train-iteration-range SKIP_TRAIN_ITERATION_RANGE [SKIP_TRAIN_ITERATION_RANGE ...]] +[default7]: [--inference] +[default7]: [--abort-on-unmet-fused-kernel-constraints] +[default6]: [--distribute-checkpointed-activations] +[default6]: [--checkpoint-num-layers CHECKPOINT_NUM_LAYERS] +[default6]: [--train-iters TRAIN_ITERS] +[default7]: [--pp-partition-method PP_PARTITION_METHOD] +[default7]: [--seed SEED] [--init-method-std INIT_METHOD_STD] +[default5]: [--checkpoint-num-layers CHECKPOINT_NUM_LAYERS] +[default5]: [--train-iters TRAIN_ITERS] +[default6]: [--train-samples TRAIN_SAMPLES] +[default6]: [--train-tokens TRAIN_TOKENS] +[default6]: [--log-interval LOG_INTERVAL] +[default5]: [--train-samples TRAIN_SAMPLES] +[default5]: [--train-tokens TRAIN_TOKENS] +[default7]: [--init-method-xavier-uniform] [--lr LR] +[default7]: [--lr-decay-style {constant,linear,cosine}] +[default6]: [--exit-interval EXIT_INTERVAL] +[default6]: [--exit-duration-in-mins EXIT_DURATION_IN_MINS] +[default7]: [--lr-decay-iters LR_DECAY_ITERS] +[default7]: [--lr-decay-samples LR_DECAY_SAMPLES] +[default7]: [--lr-decay-tokens LR_DECAY_TOKENS] +[default7]: [--lr-warmup-fraction LR_WARMUP_FRACTION] +[default5]: [--log-interval LOG_INTERVAL] +[default7]: [--lr-warmup-iters LR_WARMUP_ITERS] +[default7]: [--lr-warmup-samples LR_WARMUP_SAMPLES] +[default5]: [--exit-interval EXIT_INTERVAL] +[default5]: [--exit-duration-in-mins EXIT_DURATION_IN_MINS] +[default7]: [--warmup WARMUP] [--min-lr MIN_LR] +[default5]: [--tensorboard-dir TENSORBOARD_DIR] +[default5]: [--no-masked-softmax-fusion] [--no-bias-gelu-fusion] +[default5]: [--no-bias-dropout-fusion] [--optimizer {adam,sgd}] +[default6]: [--tensorboard-dir TENSORBOARD_DIR] +[default5]: [--use-bnb-optimizer] +[default6]: [--no-masked-softmax-fusion] [--no-bias-gelu-fusion] +[default6]: [--no-bias-dropout-fusion] [--optimizer {adam,sgd}] +[default6]: [--use-bnb-optimizer] +[default6]: [--dataloader-type {single,cyclic}] [--cpu-optimizer] +[default6]: [--cpu_torch_adam] [--codecarbon-dir CODECARBON_DIR] +[default6]: [--eval-only EVAL_ONLY] +[default6]: [--skip-train-iteration-range SKIP_TRAIN_ITERATION_RANGE [SKIP_TRAIN_ITERATION_RANGE ...]] +[default6]: [--inference] +[default6]: [--abort-on-unmet-fused-kernel-constraints] +[default6]: [--pp-partition-method PP_PARTITION_METHOD] +[default7]: [--override-lr-scheduler] +[default7]: [--use-checkpoint-lr-scheduler] [--save SAVE] +[default5]: [--dataloader-type {single,cyclic}] [--cpu-optimizer] +[default5]: [--cpu_torch_adam] [--codecarbon-dir CODECARBON_DIR] +[default7]: [--save-interval SAVE_INTERVAL] [--no-save-optim] +[default7]: [--no-save-rng] [--load LOAD] [--no-load-optim] +[default6]: [--seed SEED] [--init-method-std INIT_METHOD_STD] +[default6]: [--init-method-xavier-uniform] [--lr LR] +[default5]: [--eval-only EVAL_ONLY] +[default5]: [--skip-train-iteration-range SKIP_TRAIN_ITERATION_RANGE [SKIP_TRAIN_ITERATION_RANGE ...]] +[default6]: [--lr-decay-style {constant,linear,cosine}] +[default6]: [--lr-decay-iters LR_DECAY_ITERS] +[default5]: [--inference] +[default5]: [--abort-on-unmet-fused-kernel-constraints] +[default5]: [--pp-partition-method PP_PARTITION_METHOD] +[default6]: [--lr-decay-samples LR_DECAY_SAMPLES] +[default5]: [--seed SEED] [--init-method-std INIT_METHOD_STD] +[default6]: [--lr-decay-tokens LR_DECAY_TOKENS] +[default7]: [--no-load-rng] [--finetune] [--fp16] [--bf16] +[default7]: [--loss-scale LOSS_SCALE] +[default7]: [--initial-loss-scale INITIAL_LOSS_SCALE] +[default7]: [--min-loss-scale MIN_LOSS_SCALE] +[default6]: [--lr-warmup-fraction LR_WARMUP_FRACTION] +[default6]: [--lr-warmup-iters LR_WARMUP_ITERS] +[default6]: [--lr-warmup-samples LR_WARMUP_SAMPLES] +[default7]: [--loss-scale-window LOSS_SCALE_WINDOW] +[default7]: [--hysteresis HYSTERESIS] [--fp32-residual-connection] +[default5]: [--init-method-xavier-uniform] [--lr LR] +[default5]: [--lr-decay-style {constant,linear,cosine}] +[default6]: [--warmup WARMUP] [--min-lr MIN_LR] +[default6]: [--override-lr-scheduler] +[default5]: [--lr-decay-iters LR_DECAY_ITERS] +[default5]: [--lr-decay-samples LR_DECAY_SAMPLES] +[default5]: [--lr-decay-tokens LR_DECAY_TOKENS] +[default5]: [--lr-warmup-fraction LR_WARMUP_FRACTION] +[default7]: [--no-query-key-layer-scaling] +[default7]: [--attention-softmax-in-fp32] +[default6]: [--use-checkpoint-lr-scheduler] [--save SAVE] +[default6]: [--save-interval SAVE_INTERVAL] [--no-save-optim] +[default6]: [--no-save-rng] [--load LOAD] [--no-load-optim] +[default7]: [--accumulate-allreduce-grads-in-fp32] +[default7]: [--fp16-lm-cross-entropy] +[default5]: [--lr-warmup-iters LR_WARMUP_ITERS] +[default5]: [--lr-warmup-samples LR_WARMUP_SAMPLES] +[default7]: [--tensor-model-parallel-size TENSOR_MODEL_PARALLEL_SIZE] +[default7]: [--pipeline-model-parallel-size PIPELINE_MODEL_PARALLEL_SIZE] +[default5]: [--warmup WARMUP] [--min-lr MIN_LR] +[default5]: [--override-lr-scheduler] +[default5]: [--use-checkpoint-lr-scheduler] [--save SAVE] +[default6]: [--no-load-rng] [--finetune] [--fp16] [--bf16] +[default6]: [--loss-scale LOSS_SCALE] +[default7]: [--model-parallel-size MODEL_PARALLEL_SIZE] +[default7]: [--num-layers-per-virtual-pipeline-stage NUM_LAYERS_PER_VIRTUAL_PIPELINE_STAGE] +[default5]: [--save-interval SAVE_INTERVAL] [--no-save-optim] +[default5]: [--no-save-rng] [--load LOAD] [--no-load-optim] +[default7]: [--distributed-backend {nccl,gloo}] +[default7]: [--DDP-impl {local,torch}] +[default7]: [--use-contiguous-buffers-in-ddp] +[default5]: [--no-load-rng] [--finetune] [--fp16] [--bf16] +[default5]: [--loss-scale LOSS_SCALE] +[default5]: [--initial-loss-scale INITIAL_LOSS_SCALE] +[default7]: [--no-scatter-gather-tensors-in-pipeline] +[default7]: [--local_rank LOCAL_RANK] +[default6]: [--initial-loss-scale INITIAL_LOSS_SCALE] +[default6]: [--min-loss-scale MIN_LOSS_SCALE] +[default7]: [--lazy-mpu-init LAZY_MPU_INIT] +[default6]: [--loss-scale-window LOSS_SCALE_WINDOW] +[default6]: [--hysteresis HYSTERESIS] [--fp32-residual-connection] +[default6]: [--no-query-key-layer-scaling] +[default7]: [--use-cpu-initialization] [--eval-iters EVAL_ITERS] +[default7]: [--eval-interval EVAL_INTERVAL] +[default5]: [--min-loss-scale MIN_LOSS_SCALE] +[default5]: [--loss-scale-window LOSS_SCALE_WINDOW] +[default5]: [--hysteresis HYSTERESIS] [--fp32-residual-connection] +[default7]: [--data-path [DATA_PATH [DATA_PATH ...]]] +[default5]: [--no-query-key-layer-scaling] +[default6]: [--attention-softmax-in-fp32] +[default6]: [--accumulate-allreduce-grads-in-fp32] +[default5]: [--attention-softmax-in-fp32] +[default5]: [--accumulate-allreduce-grads-in-fp32] +[default5]: [--fp16-lm-cross-entropy] +[default7]: [--split SPLIT] +[default7]: [--train-weighted-split-paths [TRAIN_WEIGHTED_SPLIT_PATHS [TRAIN_WEIGHTED_SPLIT_PATHS ...]]] +[default7]: [--valid-weighted-split-paths [VALID_WEIGHTED_SPLIT_PATHS [VALID_WEIGHTED_SPLIT_PATHS ...]]] +[default7]: [--test-weighted-split-paths [TEST_WEIGHTED_SPLIT_PATHS [TEST_WEIGHTED_SPLIT_PATHS ...]]] +[default6]: [--fp16-lm-cross-entropy] +[default6]: [--tensor-model-parallel-size TENSOR_MODEL_PARALLEL_SIZE] +[default6]: [--pipeline-model-parallel-size PIPELINE_MODEL_PARALLEL_SIZE] +[default7]: [--train-weighted-split-paths-path TRAIN_WEIGHTED_SPLIT_PATHS_PATH] +[default7]: [--valid-weighted-split-paths-path VALID_WEIGHTED_SPLIT_PATHS_PATH] +[default5]: [--tensor-model-parallel-size TENSOR_MODEL_PARALLEL_SIZE] +[default5]: [--pipeline-model-parallel-size PIPELINE_MODEL_PARALLEL_SIZE] +[default7]: [--test-weighted-split-paths-path TEST_WEIGHTED_SPLIT_PATHS_PATH] +[default7]: [--log-path LOG_PATH] [--vocab-file VOCAB_FILE] +[default6]: [--model-parallel-size MODEL_PARALLEL_SIZE] +[default6]: [--num-layers-per-virtual-pipeline-stage NUM_LAYERS_PER_VIRTUAL_PIPELINE_STAGE] +[default7]: [--merge-file MERGE_FILE] +[default7]: [--vocab-extra-ids VOCAB_EXTRA_IDS] +[default5]: [--model-parallel-size MODEL_PARALLEL_SIZE] +[default5]: [--num-layers-per-virtual-pipeline-stage NUM_LAYERS_PER_VIRTUAL_PIPELINE_STAGE] +[default6]: [--distributed-backend {nccl,gloo}] +[default7]: [--seq-length SEQ_LENGTH] +[default7]: [--encoder-seq-length ENCODER_SEQ_LENGTH] +[default6]: [--DDP-impl {local,torch}] +[default6]: [--use-contiguous-buffers-in-ddp] +[default6]: [--no-scatter-gather-tensors-in-pipeline] +[default5]: [--distributed-backend {nccl,gloo}] +[default7]: [--decoder-seq-length DECODER_SEQ_LENGTH] +[default5]: [--DDP-impl {local,torch}] +[default5]: [--use-contiguous-buffers-in-ddp] +[default5]: [--no-scatter-gather-tensors-in-pipeline] +[default7]: [--retriever-seq-length RETRIEVER_SEQ_LENGTH] +[default7]: [--sample-rate SAMPLE_RATE] [--mask-prob MASK_PROB] +[default6]: [--local_rank LOCAL_RANK] +[default6]: [--lazy-mpu-init LAZY_MPU_INIT] +[default6]: [--use-cpu-initialization] [--eval-iters EVAL_ITERS] +[default7]: [--short-seq-prob SHORT_SEQ_PROB] [--mmap-warmup] +[default7]: [--num-workers NUM_WORKERS] +[default6]: [--eval-interval EVAL_INTERVAL] +[default7]: [--valid-num-workers VALID_NUM_WORKERS] +[default7]: [--tokenizer-type {BertWordPieceLowerCase,BertWordPieceCase,GPT2BPETokenizer,PretrainedFromHF}] +[default7]: [--tokenizer-name-or-path TOKENIZER_NAME_OR_PATH] +[default7]: [--data-impl {lazy,cached,mmap,infer}] +[default7]: [--reset-position-ids] [--reset-attention-mask] +[default7]: [--eod-mask-loss] [--loss-on-targets-only] +[default7]: [--norm-target-loss] +[default5]: [--local_rank LOCAL_RANK] +[default5]: [--lazy-mpu-init LAZY_MPU_INIT] +[default5]: [--use-cpu-initialization] [--eval-iters EVAL_ITERS] +[default5]: [--eval-interval EVAL_INTERVAL] +[default5]: [--data-path [DATA_PATH [DATA_PATH ...]]] +[default7]: [--reweight-loss-based-on-position-frequency] +[default7]: [--noise-density NOISE_DENSITY] +[default7]: [--mean-noise-span-length MEAN_NOISE_SPAN_LENGTH] +[default6]: [--data-path [DATA_PATH [DATA_PATH ...]]] +[default6]: [--split SPLIT] +[default6]: [--train-weighted-split-paths [TRAIN_WEIGHTED_SPLIT_PATHS [TRAIN_WEIGHTED_SPLIT_PATHS ...]]] +[default7]: [--prefixlm] [--adlr-autoresume] +[default6]: [--valid-weighted-split-paths [VALID_WEIGHTED_SPLIT_PATHS [VALID_WEIGHTED_SPLIT_PATHS ...]]] +[default7]: [--adlr-autoresume-interval ADLR_AUTORESUME_INTERVAL] +[default7]: [--ict-head-size ICT_HEAD_SIZE] +[default7]: [--biencoder-projection-dim BIENCODER_PROJECTION_DIM] +[default7]: [--biencoder-shared-query-context-model] +[default7]: [--ict-load ICT_LOAD] [--bert-load BERT_LOAD] +[default5]: [--split SPLIT] +[default5]: [--train-weighted-split-paths [TRAIN_WEIGHTED_SPLIT_PATHS [TRAIN_WEIGHTED_SPLIT_PATHS ...]]] +[default5]: [--valid-weighted-split-paths [VALID_WEIGHTED_SPLIT_PATHS [VALID_WEIGHTED_SPLIT_PATHS ...]]] +[default6]: [--test-weighted-split-paths [TEST_WEIGHTED_SPLIT_PATHS [TEST_WEIGHTED_SPLIT_PATHS ...]]] +[default6]: [--train-weighted-split-paths-path TRAIN_WEIGHTED_SPLIT_PATHS_PATH] +[default6]: [--valid-weighted-split-paths-path VALID_WEIGHTED_SPLIT_PATHS_PATH] +[default5]: [--test-weighted-split-paths [TEST_WEIGHTED_SPLIT_PATHS [TEST_WEIGHTED_SPLIT_PATHS ...]]] +[default5]: [--train-weighted-split-paths-path TRAIN_WEIGHTED_SPLIT_PATHS_PATH] +[default5]: [--valid-weighted-split-paths-path VALID_WEIGHTED_SPLIT_PATHS_PATH] +[default6]: [--test-weighted-split-paths-path TEST_WEIGHTED_SPLIT_PATHS_PATH] +[default7]: [--titles-data-path TITLES_DATA_PATH] +[default7]: [--query-in-block-prob QUERY_IN_BLOCK_PROB] +[default7]: [--use-one-sent-docs] +[default7]: [--evidence-data-path EVIDENCE_DATA_PATH] +[default5]: [--test-weighted-split-paths-path TEST_WEIGHTED_SPLIT_PATHS_PATH] +[default5]: [--log-path LOG_PATH] [--vocab-file VOCAB_FILE] +[default6]: [--log-path LOG_PATH] [--vocab-file VOCAB_FILE] +[default6]: [--merge-file MERGE_FILE] +[default7]: [--retriever-report-topk-accuracies RETRIEVER_REPORT_TOPK_ACCURACIES [RETRIEVER_REPORT_TOPK_ACCURACIES ...]] +[default7]: [--retriever-score-scaling] +[default7]: [--block-data-path BLOCK_DATA_PATH] +[default6]: [--vocab-extra-ids VOCAB_EXTRA_IDS] +[default6]: [--seq-length SEQ_LENGTH] +[default5]: [--merge-file MERGE_FILE] +[default5]: [--vocab-extra-ids VOCAB_EXTRA_IDS] +[default6]: [--encoder-seq-length ENCODER_SEQ_LENGTH] +[default6]: [--decoder-seq-length DECODER_SEQ_LENGTH] +[default6]: [--retriever-seq-length RETRIEVER_SEQ_LENGTH] +[default6]: [--sample-rate SAMPLE_RATE] [--mask-prob MASK_PROB] +[default5]: [--seq-length SEQ_LENGTH] +[default7]: [--embedding-path EMBEDDING_PATH] +[default7]: [--indexer-batch-size INDEXER_BATCH_SIZE] +[default7]: [--indexer-log-interval INDEXER_LOG_INTERVAL] +[default7]: [--num-classes NUM_CLASSES] [--img-dim IMG_DIM] +[default7]: [--num-channels NUM_CHANNELS] [--patch-dim PATCH_DIM] +[default7]: [--log-params-norm] [--log-num-zeros-in-grad] +[default7]: [--tensorboard-log-interval TENSORBOARD_LOG_INTERVAL] +[default5]: [--encoder-seq-length ENCODER_SEQ_LENGTH] +[default5]: [--decoder-seq-length DECODER_SEQ_LENGTH] +[default7]: [--tensorboard-queue-size TENSORBOARD_QUEUE_SIZE] +[default7]: [--log-timers-to-tensorboard] +[default7]: [--log-batch-size-to-tensorboard] +[default5]: [--retriever-seq-length RETRIEVER_SEQ_LENGTH] +[default6]: [--short-seq-prob SHORT_SEQ_PROB] [--mmap-warmup] +[default6]: [--num-workers NUM_WORKERS] +[default5]: [--sample-rate SAMPLE_RATE] [--mask-prob MASK_PROB] +[default5]: [--short-seq-prob SHORT_SEQ_PROB] [--mmap-warmup] +[default6]: [--valid-num-workers VALID_NUM_WORKERS] +[default6]: [--tokenizer-type {BertWordPieceLowerCase,BertWordPieceCase,GPT2BPETokenizer,PretrainedFromHF}] +[default7]: [--no-log-learnig-rate-to-tensorboard] +[default7]: [--no-log-loss-scale-to-tensorboard] +[default5]: [--num-workers NUM_WORKERS] +[default5]: [--valid-num-workers VALID_NUM_WORKERS] +[default5]: [--tokenizer-type {BertWordPieceLowerCase,BertWordPieceCase,GPT2BPETokenizer,PretrainedFromHF}] +[default6]: [--tokenizer-name-or-path TOKENIZER_NAME_OR_PATH] +[default6]: [--data-impl {lazy,cached,mmap,infer}] +[default6]: [--reset-position-ids] [--reset-attention-mask] +[default6]: [--eod-mask-loss] [--loss-on-targets-only] +[default5]: [--tokenizer-name-or-path TOKENIZER_NAME_OR_PATH] +[default5]: [--data-impl {lazy,cached,mmap,infer}] +[default5]: [--reset-position-ids] [--reset-attention-mask] +[default7]: [--log-validation-ppl-to-tensorboard] +[default7]: [--zero-stage ZERO_STAGE] [--zero-reduce-scatter] +[default6]: [--norm-target-loss] +[default6]: [--reweight-loss-based-on-position-frequency] +[default6]: [--noise-density NOISE_DENSITY] +[default5]: [--eod-mask-loss] [--loss-on-targets-only] +[default6]: [--mean-noise-span-length MEAN_NOISE_SPAN_LENGTH] +[default7]: [--zero-contigious-gradients] +[default7]: [--zero-reduce-bucket-size ZERO_REDUCE_BUCKET_SIZE] +[default6]: [--prefixlm] [--adlr-autoresume] +[default5]: [--norm-target-loss] +[default7]: [--zero-allgather-bucket-size ZERO_ALLGATHER_BUCKET_SIZE] +[default7]: [--remote-device {none,cpu,nvme}] [--use-pin-memory] +[default5]: [--reweight-loss-based-on-position-frequency] +[default5]: [--noise-density NOISE_DENSITY] +[default7]: [--scattered-embeddings] [--split-transformers] +[default7]: [--memory-centric-tiled-linear] +[default6]: [--adlr-autoresume-interval ADLR_AUTORESUME_INTERVAL] +[default6]: [--ict-head-size ICT_HEAD_SIZE] +[default7]: [--tile-factor TILE_FACTOR] +[default7]: [--deepspeed-activation-checkpointing] +[default7]: [--partition-activations] [--contigious-checkpointing] +[default7]: [--checkpoint-in-cpu] [--synchronize-each-layer] +[default5]: [--mean-noise-span-length MEAN_NOISE_SPAN_LENGTH] +[default5]: [--prefixlm] [--adlr-autoresume] +[default5]: [--adlr-autoresume-interval ADLR_AUTORESUME_INTERVAL] +[default7]: [--profile-backward] [--deepspeed] +[default7]: [--deepspeed_config DEEPSPEED_CONFIG] [--deepscale] +[default5]: [--ict-head-size ICT_HEAD_SIZE] +[default7]: [--deepscale_config DEEPSCALE_CONFIG] [--deepspeed_mpi] +[default7]:finetune_t0.py: error: unrecognized arguments: --reset-progress +[default6]: [--biencoder-projection-dim BIENCODER_PROJECTION_DIM] +[default6]: [--biencoder-shared-query-context-model] +[default6]: [--ict-load ICT_LOAD] [--bert-load BERT_LOAD] +[default6]: [--titles-data-path TITLES_DATA_PATH] +[default6]: [--query-in-block-prob QUERY_IN_BLOCK_PROB] +[default6]: [--use-one-sent-docs] +[default6]: [--evidence-data-path EVIDENCE_DATA_PATH] +[default5]: [--biencoder-projection-dim BIENCODER_PROJECTION_DIM] +[default5]: [--biencoder-shared-query-context-model] +[default5]: [--ict-load ICT_LOAD] [--bert-load BERT_LOAD] +[default6]: [--retriever-report-topk-accuracies RETRIEVER_REPORT_TOPK_ACCURACIES [RETRIEVER_REPORT_TOPK_ACCURACIES ...]] +[default6]: [--retriever-score-scaling] +[default6]: [--block-data-path BLOCK_DATA_PATH] +[default6]: [--embedding-path EMBEDDING_PATH] +[default5]: [--titles-data-path TITLES_DATA_PATH] +[default5]: [--query-in-block-prob QUERY_IN_BLOCK_PROB] +[default5]: [--use-one-sent-docs] +[default6]: [--indexer-batch-size INDEXER_BATCH_SIZE] +[default6]: [--indexer-log-interval INDEXER_LOG_INTERVAL] +[default6]: [--num-classes NUM_CLASSES] [--img-dim IMG_DIM] +[default6]: [--num-channels NUM_CHANNELS] [--patch-dim PATCH_DIM] +[default5]: [--evidence-data-path EVIDENCE_DATA_PATH] +[default5]: [--retriever-report-topk-accuracies RETRIEVER_REPORT_TOPK_ACCURACIES [RETRIEVER_REPORT_TOPK_ACCURACIES ...]] +[default6]: [--log-params-norm] [--log-num-zeros-in-grad] +[default6]: [--tensorboard-log-interval TENSORBOARD_LOG_INTERVAL] +[default5]: [--retriever-score-scaling] +[default6]: [--tensorboard-queue-size TENSORBOARD_QUEUE_SIZE] +[default6]: [--log-timers-to-tensorboard] +[default6]: [--log-batch-size-to-tensorboard] +[default6]: [--no-log-learnig-rate-to-tensorboard] +[default5]: [--block-data-path BLOCK_DATA_PATH] +[default5]: [--embedding-path EMBEDDING_PATH] +[default5]: [--indexer-batch-size INDEXER_BATCH_SIZE] +[default6]: [--no-log-loss-scale-to-tensorboard] +[default6]: [--log-validation-ppl-to-tensorboard] +[default6]: [--zero-stage ZERO_STAGE] [--zero-reduce-scatter] +[default5]: [--indexer-log-interval INDEXER_LOG_INTERVAL] +[default5]: [--num-classes NUM_CLASSES] [--img-dim IMG_DIM] +[default5]: [--num-channels NUM_CHANNELS] [--patch-dim PATCH_DIM] +[default6]: [--zero-contigious-gradients] +[default6]: [--zero-reduce-bucket-size ZERO_REDUCE_BUCKET_SIZE] +[default5]: [--log-params-norm] [--log-num-zeros-in-grad] +[default5]: [--tensorboard-log-interval TENSORBOARD_LOG_INTERVAL] +[default5]: [--tensorboard-queue-size TENSORBOARD_QUEUE_SIZE] +[default5]: [--log-timers-to-tensorboard] +[default6]: [--zero-allgather-bucket-size ZERO_ALLGATHER_BUCKET_SIZE] +[default6]: [--remote-device {none,cpu,nvme}] [--use-pin-memory] +[default5]: [--log-batch-size-to-tensorboard] +[default5]: [--no-log-learnig-rate-to-tensorboard] +[default5]: [--no-log-loss-scale-to-tensorboard] +[default6]: [--scattered-embeddings] [--split-transformers] +[default6]: [--memory-centric-tiled-linear] +[default6]: [--tile-factor TILE_FACTOR] +[default5]: [--log-validation-ppl-to-tensorboard] +[default5]: [--zero-stage ZERO_STAGE] [--zero-reduce-scatter] +[default6]: [--deepspeed-activation-checkpointing] +[default6]: [--partition-activations] [--contigious-checkpointing] +[default5]: [--zero-contigious-gradients] +[default5]: [--zero-reduce-bucket-size ZERO_REDUCE_BUCKET_SIZE] +[default5]: [--zero-allgather-bucket-size ZERO_ALLGATHER_BUCKET_SIZE] +[default5]: [--remote-device {none,cpu,nvme}] [--use-pin-memory] +[default6]: [--checkpoint-in-cpu] [--synchronize-each-layer] +[default6]: [--profile-backward] [--deepspeed] +[default6]: [--deepspeed_config DEEPSPEED_CONFIG] [--deepscale] +[default6]: [--deepscale_config DEEPSCALE_CONFIG] [--deepspeed_mpi] +[default5]: [--scattered-embeddings] [--split-transformers] +[default5]: [--memory-centric-tiled-linear] +[default5]: [--tile-factor TILE_FACTOR] +[default6]:finetune_t0.py: error: unrecognized arguments: --reset-progress +[default5]: [--deepspeed-activation-checkpointing] +[default5]: [--partition-activations] [--contigious-checkpointing] +[default5]: [--checkpoint-in-cpu] [--synchronize-each-layer] +[default5]: [--profile-backward] [--deepspeed] +[default5]: [--deepspeed_config DEEPSPEED_CONFIG] [--deepscale] +[default5]: [--deepscale_config DEEPSCALE_CONFIG] [--deepspeed_mpi] +[default5]:finetune_t0.py: error: unrecognized arguments: --reset-progress +[default0]:usage: finetune_t0.py [-h] [--num-layers NUM_LAYERS] +[default1]:usage: finetune_t0.py [-h] [--num-layers NUM_LAYERS] +[default0]: [--hidden-size HIDDEN_SIZE] +[default0]: [--ffn-hidden-size FFN_HIDDEN_SIZE] +[default1]: [--hidden-size HIDDEN_SIZE] +[default1]: [--ffn-hidden-size FFN_HIDDEN_SIZE] +[default0]: [--num-attention-heads NUM_ATTENTION_HEADS] +[default0]: [--kv-channels KV_CHANNELS] +[default1]: [--num-attention-heads NUM_ATTENTION_HEADS] +[default0]: [--max-position-embeddings MAX_POSITION_EMBEDDINGS] +[default0]: [--make-vocab-size-divisible-by MAKE_VOCAB_SIZE_DIVISIBLE_BY] +[default1]: [--kv-channels KV_CHANNELS] +[default0]: [--pad-vocab-size-to PAD_VOCAB_SIZE_TO] +[default0]: [--layernorm-epsilon LAYERNORM_EPSILON] +[default0]: [--sync-tp-duplicated-parameters] +[default0]: [--apply-residual-connection-post-layernorm] +[default1]: [--max-position-embeddings MAX_POSITION_EMBEDDINGS] +[default1]: [--make-vocab-size-divisible-by MAKE_VOCAB_SIZE_DIVISIBLE_BY] +[default1]: [--pad-vocab-size-to PAD_VOCAB_SIZE_TO] +[default1]: [--layernorm-epsilon LAYERNORM_EPSILON] +[default1]: [--sync-tp-duplicated-parameters] +[default0]: [--embed-layernorm] [--openai-gelu] +[default1]: [--apply-residual-connection-post-layernorm] +[default1]: [--embed-layernorm] [--openai-gelu] +[default0]: [--onnx-safe ONNX_SAFE] [--bert-no-binary-head] +[default0]: [--position-embedding-type {PositionEmbeddingType.rotary,PositionEmbeddingType.absolute,PositionEmbeddingType.alibi}] +[default0]: [--glu-activation {geglu,liglu,reglu,swiglu}] +[default1]: [--onnx-safe ONNX_SAFE] [--bert-no-binary-head] +[default1]: [--position-embedding-type {PositionEmbeddingType.rotary,PositionEmbeddingType.absolute,PositionEmbeddingType.alibi}] +[default0]: [--kill-switch-path KILL_SWITCH_PATH] +[default0]: [--log-level {debug,info,warning,error,critical}] +[default1]: [--glu-activation {geglu,liglu,reglu,swiglu}] +[default0]: [--log-level-replica {debug,info,warning,error,critical}] +[default0]: [--attention-dropout ATTENTION_DROPOUT] +[default0]: [--hidden-dropout HIDDEN_DROPOUT] +[default1]: [--kill-switch-path KILL_SWITCH_PATH] +[default1]: [--log-level {debug,info,warning,error,critical}] +[default1]: [--log-level-replica {debug,info,warning,error,critical}] +[default0]: [--weight-decay WEIGHT_DECAY] [--clip-grad CLIP_GRAD] +[default0]: [--adam-beta1 ADAM_BETA1] [--adam-beta2 ADAM_BETA2] +[default0]: [--adam-eps ADAM_EPS] [--sgd-momentum SGD_MOMENTUM] +[default1]: [--attention-dropout ATTENTION_DROPOUT] +[default1]: [--hidden-dropout HIDDEN_DROPOUT] +[default1]: [--weight-decay WEIGHT_DECAY] [--clip-grad CLIP_GRAD] +[default1]: [--adam-beta1 ADAM_BETA1] [--adam-beta2 ADAM_BETA2] +[default1]: [--adam-eps ADAM_EPS] [--sgd-momentum SGD_MOMENTUM] +[default1]: [--micro-batch-size MICRO_BATCH_SIZE] +[default0]: [--micro-batch-size MICRO_BATCH_SIZE] +[default0]: [--batch-size BATCH_SIZE] +[default0]: [--global-batch-size GLOBAL_BATCH_SIZE] +[default1]: [--batch-size BATCH_SIZE] +[default1]: [--global-batch-size GLOBAL_BATCH_SIZE] +[default1]: [--rampup-batch-size [RAMPUP_BATCH_SIZE [RAMPUP_BATCH_SIZE ...]]] +[default0]: [--rampup-batch-size [RAMPUP_BATCH_SIZE [RAMPUP_BATCH_SIZE ...]]] +[default0]: [--checkpoint-activations] +[default1]: [--checkpoint-activations] +[default1]: [--distribute-checkpointed-activations] +[default1]: [--checkpoint-num-layers CHECKPOINT_NUM_LAYERS] +[default0]: [--distribute-checkpointed-activations] +[default0]: [--checkpoint-num-layers CHECKPOINT_NUM_LAYERS] +[default0]: [--train-iters TRAIN_ITERS] +[default0]: [--train-samples TRAIN_SAMPLES] +[default0]: [--train-tokens TRAIN_TOKENS] +[default1]: [--train-iters TRAIN_ITERS] +[default3]:usage: finetune_t0.py [-h] [--num-layers NUM_LAYERS] +[default3]: [--hidden-size HIDDEN_SIZE] +[default3]: [--ffn-hidden-size FFN_HIDDEN_SIZE] +[default3]: [--num-attention-heads NUM_ATTENTION_HEADS] +[default1]: [--train-samples TRAIN_SAMPLES] +[default1]: [--train-tokens TRAIN_TOKENS] +[default1]: [--log-interval LOG_INTERVAL] +[default1]: [--exit-interval EXIT_INTERVAL] +[default1]: [--exit-duration-in-mins EXIT_DURATION_IN_MINS] +[default1]: [--tensorboard-dir TENSORBOARD_DIR] +[default0]: [--log-interval LOG_INTERVAL] +[default0]: [--exit-interval EXIT_INTERVAL] +[default4]:usage: finetune_t0.py [-h] [--num-layers NUM_LAYERS] +[default4]: [--hidden-size HIDDEN_SIZE] +[default4]: [--ffn-hidden-size FFN_HIDDEN_SIZE] +[default4]: [--num-attention-heads NUM_ATTENTION_HEADS] +[default0]: [--exit-duration-in-mins EXIT_DURATION_IN_MINS] +[default0]: [--tensorboard-dir TENSORBOARD_DIR] +[default0]: [--no-masked-softmax-fusion] [--no-bias-gelu-fusion] +[default0]: [--no-bias-dropout-fusion] [--optimizer {adam,sgd}] +[default2]:usage: finetune_t0.py [-h] [--num-layers NUM_LAYERS] +[default2]: [--hidden-size HIDDEN_SIZE] +[default4]: [--kv-channels KV_CHANNELS] +[default1]: [--no-masked-softmax-fusion] [--no-bias-gelu-fusion] +[default3]: [--kv-channels KV_CHANNELS] +[default0]: [--use-bnb-optimizer] +[default0]: [--dataloader-type {single,cyclic}] [--cpu-optimizer] +[default1]: [--no-bias-dropout-fusion] [--optimizer {adam,sgd}] +[default1]: [--use-bnb-optimizer] +[default4]: [--max-position-embeddings MAX_POSITION_EMBEDDINGS] +[default4]: [--make-vocab-size-divisible-by MAKE_VOCAB_SIZE_DIVISIBLE_BY] +[default4]: [--pad-vocab-size-to PAD_VOCAB_SIZE_TO] +[default4]: [--layernorm-epsilon LAYERNORM_EPSILON] +[default4]: [--sync-tp-duplicated-parameters] +[default3]: [--max-position-embeddings MAX_POSITION_EMBEDDINGS] +[default3]: [--make-vocab-size-divisible-by MAKE_VOCAB_SIZE_DIVISIBLE_BY] +[default3]: [--pad-vocab-size-to PAD_VOCAB_SIZE_TO] +[default4]: [--apply-residual-connection-post-layernorm] +[default4]: [--embed-layernorm] [--openai-gelu] +[default3]: [--layernorm-epsilon LAYERNORM_EPSILON] +[default1]: [--dataloader-type {single,cyclic}] [--cpu-optimizer] +[default1]: [--cpu_torch_adam] [--codecarbon-dir CODECARBON_DIR] +[default1]: [--eval-only EVAL_ONLY] +[default1]: [--skip-train-iteration-range SKIP_TRAIN_ITERATION_RANGE [SKIP_TRAIN_ITERATION_RANGE ...]] +[default3]: [--sync-tp-duplicated-parameters] +[default3]: [--apply-residual-connection-post-layernorm] +[default4]: [--onnx-safe ONNX_SAFE] [--bert-no-binary-head] +[default3]: [--embed-layernorm] [--openai-gelu] +[default2]: [--ffn-hidden-size FFN_HIDDEN_SIZE] +[default4]: [--position-embedding-type {PositionEmbeddingType.rotary,PositionEmbeddingType.absolute,PositionEmbeddingType.alibi}] +[default4]: [--glu-activation {geglu,liglu,reglu,swiglu}] +[default4]: [--kill-switch-path KILL_SWITCH_PATH] +[default4]: [--log-level {debug,info,warning,error,critical}] +[default0]: [--cpu_torch_adam] [--codecarbon-dir CODECARBON_DIR] +[default2]: [--num-attention-heads NUM_ATTENTION_HEADS] +[default2]: [--kv-channels KV_CHANNELS] +[default0]: [--eval-only EVAL_ONLY] +[default3]: [--onnx-safe ONNX_SAFE] [--bert-no-binary-head] +[default2]: [--max-position-embeddings MAX_POSITION_EMBEDDINGS] +[default2]: [--make-vocab-size-divisible-by MAKE_VOCAB_SIZE_DIVISIBLE_BY] +[default2]: [--pad-vocab-size-to PAD_VOCAB_SIZE_TO] +[default2]: [--layernorm-epsilon LAYERNORM_EPSILON] +[default4]: [--log-level-replica {debug,info,warning,error,critical}] +[default2]: [--sync-tp-duplicated-parameters] +[default4]: [--attention-dropout ATTENTION_DROPOUT] +[default4]: [--hidden-dropout HIDDEN_DROPOUT] +[default4]: [--weight-decay WEIGHT_DECAY] [--clip-grad CLIP_GRAD] +[default2]: [--apply-residual-connection-post-layernorm] +[default1]: [--inference] +[default1]: [--abort-on-unmet-fused-kernel-constraints] +[default1]: [--pp-partition-method PP_PARTITION_METHOD] +[default1]: [--seed SEED] [--init-method-std INIT_METHOD_STD] +[default1]: [--init-method-xavier-uniform] [--lr LR] +[default1]: [--lr-decay-style {constant,linear,cosine}] +[default4]: [--adam-beta1 ADAM_BETA1] [--adam-beta2 ADAM_BETA2] +[default3]: [--position-embedding-type {PositionEmbeddingType.rotary,PositionEmbeddingType.absolute,PositionEmbeddingType.alibi}] +[default3]: [--glu-activation {geglu,liglu,reglu,swiglu}] +[default4]: [--adam-eps ADAM_EPS] [--sgd-momentum SGD_MOMENTUM] +[default4]: [--micro-batch-size MICRO_BATCH_SIZE] +[default4]: [--batch-size BATCH_SIZE] +[default4]: [--global-batch-size GLOBAL_BATCH_SIZE] +[default4]: [--rampup-batch-size [RAMPUP_BATCH_SIZE [RAMPUP_BATCH_SIZE ...]]] +[default3]: [--kill-switch-path KILL_SWITCH_PATH] +[default1]: [--lr-decay-iters LR_DECAY_ITERS] +[default1]: [--lr-decay-samples LR_DECAY_SAMPLES] +[default1]: [--lr-decay-tokens LR_DECAY_TOKENS] +[default1]: [--lr-warmup-fraction LR_WARMUP_FRACTION] +[default1]: [--lr-warmup-iters LR_WARMUP_ITERS] +[default1]: [--lr-warmup-samples LR_WARMUP_SAMPLES] +[default3]: [--log-level {debug,info,warning,error,critical}] +[default3]: [--log-level-replica {debug,info,warning,error,critical}] +[default3]: [--attention-dropout ATTENTION_DROPOUT] +[default3]: [--hidden-dropout HIDDEN_DROPOUT] +[default2]: [--embed-layernorm] [--openai-gelu] +[default1]: [--warmup WARMUP] [--min-lr MIN_LR] +[default4]: [--checkpoint-activations] +[default1]: [--override-lr-scheduler] +[default4]: [--distribute-checkpointed-activations] +[default1]: [--use-checkpoint-lr-scheduler] [--save SAVE] +[default0]: [--skip-train-iteration-range SKIP_TRAIN_ITERATION_RANGE [SKIP_TRAIN_ITERATION_RANGE ...]] +[default1]: [--save-interval SAVE_INTERVAL] [--no-save-optim] +[default1]: [--no-save-rng] [--load LOAD] [--no-load-optim] +[default1]: [--no-load-rng] [--finetune] [--fp16] [--bf16] +[default1]: [--loss-scale LOSS_SCALE] +[default1]: [--initial-loss-scale INITIAL_LOSS_SCALE] +[default1]: [--min-loss-scale MIN_LOSS_SCALE] +[default1]: [--loss-scale-window LOSS_SCALE_WINDOW] +[default3]: [--weight-decay WEIGHT_DECAY] [--clip-grad CLIP_GRAD] +[default3]: [--adam-beta1 ADAM_BETA1] [--adam-beta2 ADAM_BETA2] +[default4]: [--checkpoint-num-layers CHECKPOINT_NUM_LAYERS] +[default4]: [--train-iters TRAIN_ITERS] +[default4]: [--train-samples TRAIN_SAMPLES] +[default4]: [--train-tokens TRAIN_TOKENS] +[default4]: [--log-interval LOG_INTERVAL] +[default0]: [--inference] +[default1]: [--hysteresis HYSTERESIS] [--fp32-residual-connection] +[default1]: [--no-query-key-layer-scaling] +[default0]: [--abort-on-unmet-fused-kernel-constraints] +[default4]: [--exit-interval EXIT_INTERVAL] +[default3]: [--adam-eps ADAM_EPS] [--sgd-momentum SGD_MOMENTUM] +[default3]: [--micro-batch-size MICRO_BATCH_SIZE] +[default0]: [--pp-partition-method PP_PARTITION_METHOD] +[default0]: [--seed SEED] [--init-method-std INIT_METHOD_STD] +[default0]: [--init-method-xavier-uniform] [--lr LR] +[default0]: [--lr-decay-style {constant,linear,cosine}] +[default1]: [--attention-softmax-in-fp32] +[default0]: [--lr-decay-iters LR_DECAY_ITERS] +[default2]: [--onnx-safe ONNX_SAFE] [--bert-no-binary-head] +[default2]: [--position-embedding-type {PositionEmbeddingType.rotary,PositionEmbeddingType.absolute,PositionEmbeddingType.alibi}] +[default3]: [--batch-size BATCH_SIZE] +[default4]: [--exit-duration-in-mins EXIT_DURATION_IN_MINS] +[default4]: [--tensorboard-dir TENSORBOARD_DIR] +[default4]: [--no-masked-softmax-fusion] [--no-bias-gelu-fusion] +[default2]: [--glu-activation {geglu,liglu,reglu,swiglu}] +[default2]: [--kill-switch-path KILL_SWITCH_PATH] +[default1]: [--accumulate-allreduce-grads-in-fp32] +[default2]: [--log-level {debug,info,warning,error,critical}] +[default2]: [--log-level-replica {debug,info,warning,error,critical}] +[default1]: [--fp16-lm-cross-entropy] +[default1]: [--tensor-model-parallel-size TENSOR_MODEL_PARALLEL_SIZE] +[default1]: [--pipeline-model-parallel-size PIPELINE_MODEL_PARALLEL_SIZE] +[default0]: [--lr-decay-samples LR_DECAY_SAMPLES] +[default0]: [--lr-decay-tokens LR_DECAY_TOKENS] +[default3]: [--global-batch-size GLOBAL_BATCH_SIZE] +[default1]: [--model-parallel-size MODEL_PARALLEL_SIZE] +[default4]: [--no-bias-dropout-fusion] [--optimizer {adam,sgd}] +[default4]: [--use-bnb-optimizer] +[default0]: [--lr-warmup-fraction LR_WARMUP_FRACTION] +[default3]: [--rampup-batch-size [RAMPUP_BATCH_SIZE [RAMPUP_BATCH_SIZE ...]]] +[default3]: [--checkpoint-activations] +[default3]: [--distribute-checkpointed-activations] +[default4]: [--dataloader-type {single,cyclic}] [--cpu-optimizer] +[default4]: [--cpu_torch_adam] [--codecarbon-dir CODECARBON_DIR] +[default2]: [--attention-dropout ATTENTION_DROPOUT] +[default2]: [--hidden-dropout HIDDEN_DROPOUT] +[default4]: [--eval-only EVAL_ONLY] +[default3]: [--checkpoint-num-layers CHECKPOINT_NUM_LAYERS] +[default2]: [--weight-decay WEIGHT_DECAY] [--clip-grad CLIP_GRAD] +[default1]: [--num-layers-per-virtual-pipeline-stage NUM_LAYERS_PER_VIRTUAL_PIPELINE_STAGE] +[default1]: [--distributed-backend {nccl,gloo}] +[default2]: [--adam-beta1 ADAM_BETA1] [--adam-beta2 ADAM_BETA2] +[default0]: [--lr-warmup-iters LR_WARMUP_ITERS] +[default3]: [--train-iters TRAIN_ITERS] +[default2]: [--adam-eps ADAM_EPS] [--sgd-momentum SGD_MOMENTUM] +[default1]: [--DDP-impl {local,torch}] +[default1]: [--use-contiguous-buffers-in-ddp] +[default1]: [--no-scatter-gather-tensors-in-pipeline] +[default3]: [--train-samples TRAIN_SAMPLES] +[default0]: [--lr-warmup-samples LR_WARMUP_SAMPLES] +[default0]: [--warmup WARMUP] [--min-lr MIN_LR] +[default0]: [--override-lr-scheduler] +[default4]: [--skip-train-iteration-range SKIP_TRAIN_ITERATION_RANGE [SKIP_TRAIN_ITERATION_RANGE ...]] +[default4]: [--inference] +[default2]: [--micro-batch-size MICRO_BATCH_SIZE] +[default2]: [--batch-size BATCH_SIZE] +[default2]: [--global-batch-size GLOBAL_BATCH_SIZE] +[default4]: [--abort-on-unmet-fused-kernel-constraints] +[default0]: [--use-checkpoint-lr-scheduler] [--save SAVE] +[default2]: [--rampup-batch-size [RAMPUP_BATCH_SIZE [RAMPUP_BATCH_SIZE ...]]] +[default1]: [--local_rank LOCAL_RANK] +[default1]: [--lazy-mpu-init LAZY_MPU_INIT] +[default1]: [--use-cpu-initialization] [--eval-iters EVAL_ITERS] +[default2]: [--checkpoint-activations] +[default2]: [--distribute-checkpointed-activations] +[default2]: [--checkpoint-num-layers CHECKPOINT_NUM_LAYERS] +[default2]: [--train-iters TRAIN_ITERS] +[default1]: [--eval-interval EVAL_INTERVAL] +[default1]: [--data-path [DATA_PATH [DATA_PATH ...]]] +[default0]: [--save-interval SAVE_INTERVAL] [--no-save-optim] +[default0]: [--no-save-rng] [--load LOAD] [--no-load-optim] +[default4]: [--pp-partition-method PP_PARTITION_METHOD] +[default1]: [--split SPLIT] +[default3]: [--train-tokens TRAIN_TOKENS] +[default2]: [--train-samples TRAIN_SAMPLES] +[default2]: [--train-tokens TRAIN_TOKENS] +[default2]: [--log-interval LOG_INTERVAL] +[default3]: [--log-interval LOG_INTERVAL] +[default3]: [--exit-interval EXIT_INTERVAL] +[default3]: [--exit-duration-in-mins EXIT_DURATION_IN_MINS] +[default1]: [--train-weighted-split-paths [TRAIN_WEIGHTED_SPLIT_PATHS [TRAIN_WEIGHTED_SPLIT_PATHS ...]]] +[default1]: [--valid-weighted-split-paths [VALID_WEIGHTED_SPLIT_PATHS [VALID_WEIGHTED_SPLIT_PATHS ...]]] +[default1]: [--test-weighted-split-paths [TEST_WEIGHTED_SPLIT_PATHS [TEST_WEIGHTED_SPLIT_PATHS ...]]] +[default2]: [--exit-interval EXIT_INTERVAL] +[default2]: [--exit-duration-in-mins EXIT_DURATION_IN_MINS] +[default2]: [--tensorboard-dir TENSORBOARD_DIR] +[default0]: [--no-load-rng] [--finetune] [--fp16] [--bf16] +[default0]: [--loss-scale LOSS_SCALE] +[default1]: [--train-weighted-split-paths-path TRAIN_WEIGHTED_SPLIT_PATHS_PATH] +[default3]: [--tensorboard-dir TENSORBOARD_DIR] +[default3]: [--no-masked-softmax-fusion] [--no-bias-gelu-fusion] +[default2]: [--no-masked-softmax-fusion] [--no-bias-gelu-fusion] +[default3]: [--no-bias-dropout-fusion] [--optimizer {adam,sgd}] +[default2]: [--no-bias-dropout-fusion] [--optimizer {adam,sgd}] +[default3]: [--use-bnb-optimizer] +[default3]: [--dataloader-type {single,cyclic}] [--cpu-optimizer] +[default4]: [--seed SEED] [--init-method-std INIT_METHOD_STD] +[default2]: [--use-bnb-optimizer] +[default2]: [--dataloader-type {single,cyclic}] [--cpu-optimizer] +[default2]: [--cpu_torch_adam] [--codecarbon-dir CODECARBON_DIR] +[default2]: [--eval-only EVAL_ONLY] +[default2]: [--skip-train-iteration-range SKIP_TRAIN_ITERATION_RANGE [SKIP_TRAIN_ITERATION_RANGE ...]] +[default4]: [--init-method-xavier-uniform] [--lr LR] +[default4]: [--lr-decay-style {constant,linear,cosine}] +[default4]: [--lr-decay-iters LR_DECAY_ITERS] +[default4]: [--lr-decay-samples LR_DECAY_SAMPLES] +[default4]: [--lr-decay-tokens LR_DECAY_TOKENS] +[default2]: [--inference] +[default2]: [--abort-on-unmet-fused-kernel-constraints] +[default2]: [--pp-partition-method PP_PARTITION_METHOD] +[default3]: [--cpu_torch_adam] [--codecarbon-dir CODECARBON_DIR] +[default0]: [--initial-loss-scale INITIAL_LOSS_SCALE] +[default0]: [--min-loss-scale MIN_LOSS_SCALE] +[default0]: [--loss-scale-window LOSS_SCALE_WINDOW] +[default0]: [--hysteresis HYSTERESIS] [--fp32-residual-connection] +[default4]: [--lr-warmup-fraction LR_WARMUP_FRACTION] +[default4]: [--lr-warmup-iters LR_WARMUP_ITERS] +[default0]: [--no-query-key-layer-scaling] +[default1]: [--valid-weighted-split-paths-path VALID_WEIGHTED_SPLIT_PATHS_PATH] +[default1]: [--test-weighted-split-paths-path TEST_WEIGHTED_SPLIT_PATHS_PATH] +[default0]: [--attention-softmax-in-fp32] +[default0]: [--accumulate-allreduce-grads-in-fp32] +[default0]: [--fp16-lm-cross-entropy] +[default4]: [--lr-warmup-samples LR_WARMUP_SAMPLES] +[default4]: [--warmup WARMUP] [--min-lr MIN_LR] +[default4]: [--override-lr-scheduler] +[default4]: [--use-checkpoint-lr-scheduler] [--save SAVE] +[default1]: [--log-path LOG_PATH] [--vocab-file VOCAB_FILE] +[default1]: [--merge-file MERGE_FILE] +[default3]: [--eval-only EVAL_ONLY] +[default3]: [--skip-train-iteration-range SKIP_TRAIN_ITERATION_RANGE [SKIP_TRAIN_ITERATION_RANGE ...]] +[default3]: [--inference] +[default3]: [--abort-on-unmet-fused-kernel-constraints] +[default2]: [--seed SEED] [--init-method-std INIT_METHOD_STD] +[default3]: [--pp-partition-method PP_PARTITION_METHOD] +[default3]: [--seed SEED] [--init-method-std INIT_METHOD_STD] +[default3]: [--init-method-xavier-uniform] [--lr LR] +[default4]: [--save-interval SAVE_INTERVAL] [--no-save-optim] +[default2]: [--init-method-xavier-uniform] [--lr LR] +[default2]: [--lr-decay-style {constant,linear,cosine}] +[default2]: [--lr-decay-iters LR_DECAY_ITERS] +[default0]: [--tensor-model-parallel-size TENSOR_MODEL_PARALLEL_SIZE] +[default0]: [--pipeline-model-parallel-size PIPELINE_MODEL_PARALLEL_SIZE] +[default2]: [--lr-decay-samples LR_DECAY_SAMPLES] +[default2]: [--lr-decay-tokens LR_DECAY_TOKENS] +[default2]: [--lr-warmup-fraction LR_WARMUP_FRACTION] +[default2]: [--lr-warmup-iters LR_WARMUP_ITERS] +[default0]: [--model-parallel-size MODEL_PARALLEL_SIZE] +[default0]: [--num-layers-per-virtual-pipeline-stage NUM_LAYERS_PER_VIRTUAL_PIPELINE_STAGE] +[default4]: [--no-save-rng] [--load LOAD] [--no-load-optim] +[default3]: [--lr-decay-style {constant,linear,cosine}] +[default3]: [--lr-decay-iters LR_DECAY_ITERS] +[default4]: [--no-load-rng] [--finetune] [--fp16] [--bf16] +[default4]: [--loss-scale LOSS_SCALE] +[default4]: [--initial-loss-scale INITIAL_LOSS_SCALE] +[default4]: [--min-loss-scale MIN_LOSS_SCALE] +[default4]: [--loss-scale-window LOSS_SCALE_WINDOW] +[default3]: [--lr-decay-samples LR_DECAY_SAMPLES] +[default4]: [--hysteresis HYSTERESIS] [--fp32-residual-connection] +[default1]: [--vocab-extra-ids VOCAB_EXTRA_IDS] +[default3]: [--lr-decay-tokens LR_DECAY_TOKENS] +[default3]: [--lr-warmup-fraction LR_WARMUP_FRACTION] +[default3]: [--lr-warmup-iters LR_WARMUP_ITERS] +[default3]: [--lr-warmup-samples LR_WARMUP_SAMPLES] +[default3]: [--warmup WARMUP] [--min-lr MIN_LR] +[default3]: [--override-lr-scheduler] +[default0]: [--distributed-backend {nccl,gloo}] +[default0]: [--DDP-impl {local,torch}] +[default2]: [--lr-warmup-samples LR_WARMUP_SAMPLES] +[default3]: [--use-checkpoint-lr-scheduler] [--save SAVE] +[default4]: [--no-query-key-layer-scaling] +[default2]: [--warmup WARMUP] [--min-lr MIN_LR] +[default2]: [--override-lr-scheduler] +[default2]: [--use-checkpoint-lr-scheduler] [--save SAVE] +[default2]: [--save-interval SAVE_INTERVAL] [--no-save-optim] +[default3]: [--save-interval SAVE_INTERVAL] [--no-save-optim] +[default0]: [--use-contiguous-buffers-in-ddp] +[default1]: [--seq-length SEQ_LENGTH] +[default1]: [--encoder-seq-length ENCODER_SEQ_LENGTH] +[default0]: [--no-scatter-gather-tensors-in-pipeline] +[default0]: [--local_rank LOCAL_RANK] +[default3]: [--no-save-rng] [--load LOAD] [--no-load-optim] +[default3]: [--no-load-rng] [--finetune] [--fp16] [--bf16] +[default1]: [--decoder-seq-length DECODER_SEQ_LENGTH] +[default2]: [--no-save-rng] [--load LOAD] [--no-load-optim] +[default2]: [--no-load-rng] [--finetune] [--fp16] [--bf16] +[default4]: [--attention-softmax-in-fp32] +[default4]: [--accumulate-allreduce-grads-in-fp32] +[default4]: [--fp16-lm-cross-entropy] +[default4]: [--tensor-model-parallel-size TENSOR_MODEL_PARALLEL_SIZE] +[default2]: [--loss-scale LOSS_SCALE] +[default2]: [--initial-loss-scale INITIAL_LOSS_SCALE] +[default2]: [--min-loss-scale MIN_LOSS_SCALE] +[default1]: [--retriever-seq-length RETRIEVER_SEQ_LENGTH] +[default1]: [--sample-rate SAMPLE_RATE] [--mask-prob MASK_PROB] +[default3]: [--loss-scale LOSS_SCALE] +[default2]: [--loss-scale-window LOSS_SCALE_WINDOW] +[default2]: [--hysteresis HYSTERESIS] [--fp32-residual-connection] +[default4]: [--pipeline-model-parallel-size PIPELINE_MODEL_PARALLEL_SIZE] +[default0]: [--lazy-mpu-init LAZY_MPU_INIT] +[default0]: [--use-cpu-initialization] [--eval-iters EVAL_ITERS] +[default0]: [--eval-interval EVAL_INTERVAL] +[default0]: [--data-path [DATA_PATH [DATA_PATH ...]]] +[default1]: [--short-seq-prob SHORT_SEQ_PROB] [--mmap-warmup] +[default1]: [--num-workers NUM_WORKERS] +[default1]: [--valid-num-workers VALID_NUM_WORKERS] +[default3]: [--initial-loss-scale INITIAL_LOSS_SCALE] +[default3]: [--min-loss-scale MIN_LOSS_SCALE] +[default0]: [--split SPLIT] +[default2]: [--no-query-key-layer-scaling] +[default2]: [--attention-softmax-in-fp32] +[default2]: [--accumulate-allreduce-grads-in-fp32] +[default2]: [--fp16-lm-cross-entropy] +[default2]: [--tensor-model-parallel-size TENSOR_MODEL_PARALLEL_SIZE] +[default2]: [--pipeline-model-parallel-size PIPELINE_MODEL_PARALLEL_SIZE] +[default2]: [--model-parallel-size MODEL_PARALLEL_SIZE] +[default2]: [--num-layers-per-virtual-pipeline-stage NUM_LAYERS_PER_VIRTUAL_PIPELINE_STAGE] +[default2]: [--distributed-backend {nccl,gloo}] +[default2]: [--DDP-impl {local,torch}] +[default2]: [--use-contiguous-buffers-in-ddp] +[default2]: [--no-scatter-gather-tensors-in-pipeline] +[default1]: [--tokenizer-type {BertWordPieceLowerCase,BertWordPieceCase,GPT2BPETokenizer,PretrainedFromHF}] +[default2]: [--local_rank LOCAL_RANK] +[default3]: [--loss-scale-window LOSS_SCALE_WINDOW] +[default2]: [--lazy-mpu-init LAZY_MPU_INIT] +[default1]: [--tokenizer-name-or-path TOKENIZER_NAME_OR_PATH] +[default2]: [--use-cpu-initialization] [--eval-iters EVAL_ITERS] +[default3]: [--hysteresis HYSTERESIS] [--fp32-residual-connection] +[default3]: [--no-query-key-layer-scaling] +[default2]: [--eval-interval EVAL_INTERVAL] +[default2]: [--data-path [DATA_PATH [DATA_PATH ...]]] +[default2]: [--split SPLIT] +[default2]: [--train-weighted-split-paths [TRAIN_WEIGHTED_SPLIT_PATHS [TRAIN_WEIGHTED_SPLIT_PATHS ...]]] +[default2]: [--valid-weighted-split-paths [VALID_WEIGHTED_SPLIT_PATHS [VALID_WEIGHTED_SPLIT_PATHS ...]]] +[default3]: [--attention-softmax-in-fp32] +[default2]: [--test-weighted-split-paths [TEST_WEIGHTED_SPLIT_PATHS [TEST_WEIGHTED_SPLIT_PATHS ...]]] +[default4]: [--model-parallel-size MODEL_PARALLEL_SIZE] +[default4]: [--num-layers-per-virtual-pipeline-stage NUM_LAYERS_PER_VIRTUAL_PIPELINE_STAGE] +[default4]: [--distributed-backend {nccl,gloo}] +[default3]: [--accumulate-allreduce-grads-in-fp32] +[default3]: [--fp16-lm-cross-entropy] +[default3]: [--tensor-model-parallel-size TENSOR_MODEL_PARALLEL_SIZE] +[default3]: [--pipeline-model-parallel-size PIPELINE_MODEL_PARALLEL_SIZE] +[default3]: [--model-parallel-size MODEL_PARALLEL_SIZE] +[default3]: [--num-layers-per-virtual-pipeline-stage NUM_LAYERS_PER_VIRTUAL_PIPELINE_STAGE] +[default3]: [--distributed-backend {nccl,gloo}] +[default3]: [--DDP-impl {local,torch}] +[default0]: [--train-weighted-split-paths [TRAIN_WEIGHTED_SPLIT_PATHS [TRAIN_WEIGHTED_SPLIT_PATHS ...]]] +[default4]: [--DDP-impl {local,torch}] +[default4]: [--use-contiguous-buffers-in-ddp] +[default4]: [--no-scatter-gather-tensors-in-pipeline] +[default0]: [--valid-weighted-split-paths [VALID_WEIGHTED_SPLIT_PATHS [VALID_WEIGHTED_SPLIT_PATHS ...]]] +[default0]: [--test-weighted-split-paths [TEST_WEIGHTED_SPLIT_PATHS [TEST_WEIGHTED_SPLIT_PATHS ...]]] +[default1]: [--data-impl {lazy,cached,mmap,infer}] +[default1]: [--reset-position-ids] [--reset-attention-mask] +[default1]: [--eod-mask-loss] [--loss-on-targets-only] +[default1]: [--norm-target-loss] +[default1]: [--reweight-loss-based-on-position-frequency] +[default0]: [--train-weighted-split-paths-path TRAIN_WEIGHTED_SPLIT_PATHS_PATH] +[default0]: [--valid-weighted-split-paths-path VALID_WEIGHTED_SPLIT_PATHS_PATH] +[default4]: [--local_rank LOCAL_RANK] +[default4]: [--lazy-mpu-init LAZY_MPU_INIT] +[default0]: [--test-weighted-split-paths-path TEST_WEIGHTED_SPLIT_PATHS_PATH] +[default1]: [--noise-density NOISE_DENSITY] +[default1]: [--mean-noise-span-length MEAN_NOISE_SPAN_LENGTH] +[default1]: [--prefixlm] [--adlr-autoresume] +[default1]: [--adlr-autoresume-interval ADLR_AUTORESUME_INTERVAL] +[default3]: [--use-contiguous-buffers-in-ddp] +[default1]: [--ict-head-size ICT_HEAD_SIZE] +[default4]: [--use-cpu-initialization] [--eval-iters EVAL_ITERS] +[default4]: [--eval-interval EVAL_INTERVAL] +[default4]: [--data-path [DATA_PATH [DATA_PATH ...]]] +[default2]: [--train-weighted-split-paths-path TRAIN_WEIGHTED_SPLIT_PATHS_PATH] +[default1]: [--biencoder-projection-dim BIENCODER_PROJECTION_DIM] +[default1]: [--biencoder-shared-query-context-model] +[default1]: [--ict-load ICT_LOAD] [--bert-load BERT_LOAD] +[default1]: [--titles-data-path TITLES_DATA_PATH] +[default1]: [--query-in-block-prob QUERY_IN_BLOCK_PROB] +[default2]: [--valid-weighted-split-paths-path VALID_WEIGHTED_SPLIT_PATHS_PATH] +[default2]: [--test-weighted-split-paths-path TEST_WEIGHTED_SPLIT_PATHS_PATH] +[default1]: [--use-one-sent-docs] +[default4]: [--split SPLIT] +[default4]: [--train-weighted-split-paths [TRAIN_WEIGHTED_SPLIT_PATHS [TRAIN_WEIGHTED_SPLIT_PATHS ...]]] +[default1]: [--evidence-data-path EVIDENCE_DATA_PATH] +[default2]: [--log-path LOG_PATH] [--vocab-file VOCAB_FILE] +[default1]: [--retriever-report-topk-accuracies RETRIEVER_REPORT_TOPK_ACCURACIES [RETRIEVER_REPORT_TOPK_ACCURACIES ...]] +[default1]: [--retriever-score-scaling] +[default1]: [--block-data-path BLOCK_DATA_PATH] +[default1]: [--embedding-path EMBEDDING_PATH] +[default1]: [--indexer-batch-size INDEXER_BATCH_SIZE] +[default4]: [--valid-weighted-split-paths [VALID_WEIGHTED_SPLIT_PATHS [VALID_WEIGHTED_SPLIT_PATHS ...]]] +[default4]: [--test-weighted-split-paths [TEST_WEIGHTED_SPLIT_PATHS [TEST_WEIGHTED_SPLIT_PATHS ...]]] +[default4]: [--train-weighted-split-paths-path TRAIN_WEIGHTED_SPLIT_PATHS_PATH] +[default4]: [--valid-weighted-split-paths-path VALID_WEIGHTED_SPLIT_PATHS_PATH] +[default3]: [--no-scatter-gather-tensors-in-pipeline] +[default3]: [--local_rank LOCAL_RANK] +[default1]: [--indexer-log-interval INDEXER_LOG_INTERVAL] +[default1]: [--num-classes NUM_CLASSES] [--img-dim IMG_DIM] +[default3]: [--lazy-mpu-init LAZY_MPU_INIT] +[default2]: [--merge-file MERGE_FILE] +[default2]: [--vocab-extra-ids VOCAB_EXTRA_IDS] +[default2]: [--seq-length SEQ_LENGTH] +[default2]: [--encoder-seq-length ENCODER_SEQ_LENGTH] +[default2]: [--decoder-seq-length DECODER_SEQ_LENGTH] +[default0]: [--log-path LOG_PATH] [--vocab-file VOCAB_FILE] +[default0]: [--merge-file MERGE_FILE] +[default0]: [--vocab-extra-ids VOCAB_EXTRA_IDS] +[default3]: [--use-cpu-initialization] [--eval-iters EVAL_ITERS] +[default4]: [--test-weighted-split-paths-path TEST_WEIGHTED_SPLIT_PATHS_PATH] +[default1]: [--num-channels NUM_CHANNELS] [--patch-dim PATCH_DIM] +[default1]: [--log-params-norm] [--log-num-zeros-in-grad] +[default3]: [--eval-interval EVAL_INTERVAL] +[default3]: [--data-path [DATA_PATH [DATA_PATH ...]]] +[default3]: [--split SPLIT] +[default3]: [--train-weighted-split-paths [TRAIN_WEIGHTED_SPLIT_PATHS [TRAIN_WEIGHTED_SPLIT_PATHS ...]]] +[default1]: [--tensorboard-log-interval TENSORBOARD_LOG_INTERVAL] +[default0]: [--seq-length SEQ_LENGTH] +[default1]: [--tensorboard-queue-size TENSORBOARD_QUEUE_SIZE] +[default0]: [--encoder-seq-length ENCODER_SEQ_LENGTH] +[default0]: [--decoder-seq-length DECODER_SEQ_LENGTH] +[default0]: [--retriever-seq-length RETRIEVER_SEQ_LENGTH] +[default0]: [--sample-rate SAMPLE_RATE] [--mask-prob MASK_PROB] +[default3]: [--valid-weighted-split-paths [VALID_WEIGHTED_SPLIT_PATHS [VALID_WEIGHTED_SPLIT_PATHS ...]]] +[default3]: [--test-weighted-split-paths [TEST_WEIGHTED_SPLIT_PATHS [TEST_WEIGHTED_SPLIT_PATHS ...]]] +[default0]: [--short-seq-prob SHORT_SEQ_PROB] [--mmap-warmup] +[default0]: [--num-workers NUM_WORKERS] +[default3]: [--train-weighted-split-paths-path TRAIN_WEIGHTED_SPLIT_PATHS_PATH] +[default3]: [--valid-weighted-split-paths-path VALID_WEIGHTED_SPLIT_PATHS_PATH] +[default3]: [--test-weighted-split-paths-path TEST_WEIGHTED_SPLIT_PATHS_PATH] +[default3]: [--log-path LOG_PATH] [--vocab-file VOCAB_FILE] +[default3]: [--merge-file MERGE_FILE] +[default3]: [--vocab-extra-ids VOCAB_EXTRA_IDS] +[default0]: [--valid-num-workers VALID_NUM_WORKERS] +[default3]: [--seq-length SEQ_LENGTH] +[default3]: [--encoder-seq-length ENCODER_SEQ_LENGTH] +[default3]: [--decoder-seq-length DECODER_SEQ_LENGTH] +[default2]: [--retriever-seq-length RETRIEVER_SEQ_LENGTH] +[default3]: [--retriever-seq-length RETRIEVER_SEQ_LENGTH] +[default2]: [--sample-rate SAMPLE_RATE] [--mask-prob MASK_PROB] +[default2]: [--short-seq-prob SHORT_SEQ_PROB] [--mmap-warmup] +[default2]: [--num-workers NUM_WORKERS] +[default3]: [--sample-rate SAMPLE_RATE] [--mask-prob MASK_PROB] +[default3]: [--short-seq-prob SHORT_SEQ_PROB] [--mmap-warmup] +[default2]: [--valid-num-workers VALID_NUM_WORKERS] +[default2]: [--tokenizer-type {BertWordPieceLowerCase,BertWordPieceCase,GPT2BPETokenizer,PretrainedFromHF}] +[default2]: [--tokenizer-name-or-path TOKENIZER_NAME_OR_PATH] +[default0]: [--tokenizer-type {BertWordPieceLowerCase,BertWordPieceCase,GPT2BPETokenizer,PretrainedFromHF}] +[default2]: [--data-impl {lazy,cached,mmap,infer}] +[default2]: [--reset-position-ids] [--reset-attention-mask] +[default2]: [--eod-mask-loss] [--loss-on-targets-only] +[default4]: [--log-path LOG_PATH] [--vocab-file VOCAB_FILE] +[default4]: [--merge-file MERGE_FILE] +[default4]: [--vocab-extra-ids VOCAB_EXTRA_IDS] +[default4]: [--seq-length SEQ_LENGTH] +[default1]: [--log-timers-to-tensorboard] +[default1]: [--log-batch-size-to-tensorboard] +[default2]: [--norm-target-loss] +[default2]: [--reweight-loss-based-on-position-frequency] +[default2]: [--noise-density NOISE_DENSITY] +[default4]: [--encoder-seq-length ENCODER_SEQ_LENGTH] +[default0]: [--tokenizer-name-or-path TOKENIZER_NAME_OR_PATH] +[default0]: [--data-impl {lazy,cached,mmap,infer}] +[default3]: [--num-workers NUM_WORKERS] +[default2]: [--mean-noise-span-length MEAN_NOISE_SPAN_LENGTH] +[default0]: [--reset-position-ids] [--reset-attention-mask] +[default0]: [--eod-mask-loss] [--loss-on-targets-only] +[default1]: [--no-log-learnig-rate-to-tensorboard] +[default1]: [--no-log-loss-scale-to-tensorboard] +[default2]: [--prefixlm] [--adlr-autoresume] +[default2]: [--adlr-autoresume-interval ADLR_AUTORESUME_INTERVAL] +[default1]: [--log-validation-ppl-to-tensorboard] +[default3]: [--valid-num-workers VALID_NUM_WORKERS] +[default2]: [--ict-head-size ICT_HEAD_SIZE] +[default0]: [--norm-target-loss] +[default0]: [--reweight-loss-based-on-position-frequency] +[default0]: [--noise-density NOISE_DENSITY] +[default3]: [--tokenizer-type {BertWordPieceLowerCase,BertWordPieceCase,GPT2BPETokenizer,PretrainedFromHF}] +[default3]: [--tokenizer-name-or-path TOKENIZER_NAME_OR_PATH] +[default2]: [--biencoder-projection-dim BIENCODER_PROJECTION_DIM] +[default2]: [--biencoder-shared-query-context-model] +[default4]: [--decoder-seq-length DECODER_SEQ_LENGTH] +[default2]: [--ict-load ICT_LOAD] [--bert-load BERT_LOAD] +[default2]: [--titles-data-path TITLES_DATA_PATH] +[default4]: [--retriever-seq-length RETRIEVER_SEQ_LENGTH] +[default4]: [--sample-rate SAMPLE_RATE] [--mask-prob MASK_PROB] +[default0]: [--mean-noise-span-length MEAN_NOISE_SPAN_LENGTH] +[default0]: [--prefixlm] [--adlr-autoresume] +[default0]: [--adlr-autoresume-interval ADLR_AUTORESUME_INTERVAL] +[default4]: [--short-seq-prob SHORT_SEQ_PROB] [--mmap-warmup] +[default4]: [--num-workers NUM_WORKERS] +[default1]: [--zero-stage ZERO_STAGE] [--zero-reduce-scatter] +[default2]: [--query-in-block-prob QUERY_IN_BLOCK_PROB] +[default2]: [--use-one-sent-docs] +[default1]: [--zero-contigious-gradients] +[default3]: [--data-impl {lazy,cached,mmap,infer}] +[default0]: [--ict-head-size ICT_HEAD_SIZE] +[default4]: [--valid-num-workers VALID_NUM_WORKERS] +[default2]: [--evidence-data-path EVIDENCE_DATA_PATH] +[default2]: [--retriever-report-topk-accuracies RETRIEVER_REPORT_TOPK_ACCURACIES [RETRIEVER_REPORT_TOPK_ACCURACIES ...]] +[default2]: [--retriever-score-scaling] +[default0]: [--biencoder-projection-dim BIENCODER_PROJECTION_DIM] +[default4]: [--tokenizer-type {BertWordPieceLowerCase,BertWordPieceCase,GPT2BPETokenizer,PretrainedFromHF}] +[default1]: [--zero-reduce-bucket-size ZERO_REDUCE_BUCKET_SIZE] +[default1]: [--zero-allgather-bucket-size ZERO_ALLGATHER_BUCKET_SIZE] +[default1]: [--remote-device {none,cpu,nvme}] [--use-pin-memory] +[default2]: [--block-data-path BLOCK_DATA_PATH] +[default1]: [--scattered-embeddings] [--split-transformers] +[default1]: [--memory-centric-tiled-linear] +[default1]: [--tile-factor TILE_FACTOR] +[default2]: [--embedding-path EMBEDDING_PATH] +[default2]: [--indexer-batch-size INDEXER_BATCH_SIZE] +[default2]: [--indexer-log-interval INDEXER_LOG_INTERVAL] +[default2]: [--num-classes NUM_CLASSES] [--img-dim IMG_DIM] +[default2]: [--num-channels NUM_CHANNELS] [--patch-dim PATCH_DIM] +[default0]: [--biencoder-shared-query-context-model] +[default0]: [--ict-load ICT_LOAD] [--bert-load BERT_LOAD] +[default2]: [--log-params-norm] [--log-num-zeros-in-grad] +[default2]: [--tensorboard-log-interval TENSORBOARD_LOG_INTERVAL] +[default4]: [--tokenizer-name-or-path TOKENIZER_NAME_OR_PATH] +[default4]: [--data-impl {lazy,cached,mmap,infer}] +[default0]: [--titles-data-path TITLES_DATA_PATH] +[default2]: [--tensorboard-queue-size TENSORBOARD_QUEUE_SIZE] +[default2]: [--log-timers-to-tensorboard] +[default3]: [--reset-position-ids] [--reset-attention-mask] +[default3]: [--eod-mask-loss] [--loss-on-targets-only] +[default3]: [--norm-target-loss] +[default0]: [--query-in-block-prob QUERY_IN_BLOCK_PROB] +[default0]: [--use-one-sent-docs] +[default0]: [--evidence-data-path EVIDENCE_DATA_PATH] +[default0]: [--retriever-report-topk-accuracies RETRIEVER_REPORT_TOPK_ACCURACIES [RETRIEVER_REPORT_TOPK_ACCURACIES ...]] +[default0]: [--retriever-score-scaling] +[default1]: [--deepspeed-activation-checkpointing] +[default1]: [--partition-activations] [--contigious-checkpointing] +[default1]: [--checkpoint-in-cpu] [--synchronize-each-layer] +[default2]: [--log-batch-size-to-tensorboard] +[default2]: [--no-log-learnig-rate-to-tensorboard] +[default2]: [--no-log-loss-scale-to-tensorboard] +[default2]: [--log-validation-ppl-to-tensorboard] +[default2]: [--zero-stage ZERO_STAGE] [--zero-reduce-scatter] +[default2]: [--zero-contigious-gradients] +[default1]: [--profile-backward] [--deepspeed] +[default1]: [--deepspeed_config DEEPSPEED_CONFIG] [--deepscale] +[default1]: [--deepscale_config DEEPSCALE_CONFIG] [--deepspeed_mpi] +[default1]:finetune_t0.py: error: unrecognized arguments: --reset-progress +[default2]: [--zero-reduce-bucket-size ZERO_REDUCE_BUCKET_SIZE] +[default4]: [--reset-position-ids] [--reset-attention-mask] +[default4]: [--eod-mask-loss] [--loss-on-targets-only] +[default4]: [--norm-target-loss] +[default4]: [--reweight-loss-based-on-position-frequency] +[default4]: [--noise-density NOISE_DENSITY] +[default3]: [--reweight-loss-based-on-position-frequency] +[default2]: [--zero-allgather-bucket-size ZERO_ALLGATHER_BUCKET_SIZE] +[default2]: [--remote-device {none,cpu,nvme}] [--use-pin-memory] +[default2]: [--scattered-embeddings] [--split-transformers] +[default0]: [--block-data-path BLOCK_DATA_PATH] +[default0]: [--embedding-path EMBEDDING_PATH] +[default0]: [--indexer-batch-size INDEXER_BATCH_SIZE] +[default3]: [--noise-density NOISE_DENSITY] +[default0]: [--indexer-log-interval INDEXER_LOG_INTERVAL] +[default0]: [--num-classes NUM_CLASSES] [--img-dim IMG_DIM] +[default0]: [--num-channels NUM_CHANNELS] [--patch-dim PATCH_DIM] +[default2]: [--memory-centric-tiled-linear] +[default4]: [--mean-noise-span-length MEAN_NOISE_SPAN_LENGTH] +[default3]: [--mean-noise-span-length MEAN_NOISE_SPAN_LENGTH] +[default3]: [--prefixlm] [--adlr-autoresume] +[default3]: [--adlr-autoresume-interval ADLR_AUTORESUME_INTERVAL] +[default2]: [--tile-factor TILE_FACTOR] +[default2]: [--deepspeed-activation-checkpointing] +[default2]: [--partition-activations] [--contigious-checkpointing] +[default2]: [--checkpoint-in-cpu] [--synchronize-each-layer] +[default2]: [--profile-backward] [--deepspeed] +[default0]: [--log-params-norm] [--log-num-zeros-in-grad] +[default0]: [--tensorboard-log-interval TENSORBOARD_LOG_INTERVAL] +[default0]: [--tensorboard-queue-size TENSORBOARD_QUEUE_SIZE] +[default0]: [--log-timers-to-tensorboard] +[default0]: [--log-batch-size-to-tensorboard] +[default2]: [--deepspeed_config DEEPSPEED_CONFIG] [--deepscale] +[default2]: [--deepscale_config DEEPSCALE_CONFIG] [--deepspeed_mpi] +[default4]: [--prefixlm] [--adlr-autoresume] +[default4]: [--adlr-autoresume-interval ADLR_AUTORESUME_INTERVAL] +[default4]: [--ict-head-size ICT_HEAD_SIZE] +[default3]: [--ict-head-size ICT_HEAD_SIZE] +[default4]: [--biencoder-projection-dim BIENCODER_PROJECTION_DIM] +[default3]: [--biencoder-projection-dim BIENCODER_PROJECTION_DIM] +[default4]: [--biencoder-shared-query-context-model] +[default4]: [--ict-load ICT_LOAD] [--bert-load BERT_LOAD] +[default3]: [--biencoder-shared-query-context-model] +[default3]: [--ict-load ICT_LOAD] [--bert-load BERT_LOAD] +[default3]: [--titles-data-path TITLES_DATA_PATH] +[default4]: [--titles-data-path TITLES_DATA_PATH] +[default4]: [--query-in-block-prob QUERY_IN_BLOCK_PROB] +[default3]: [--query-in-block-prob QUERY_IN_BLOCK_PROB] +[default3]: [--use-one-sent-docs] +[default0]: [--no-log-learnig-rate-to-tensorboard] +[default0]: [--no-log-loss-scale-to-tensorboard] +[default0]: [--log-validation-ppl-to-tensorboard] +[default0]: [--zero-stage ZERO_STAGE] [--zero-reduce-scatter] +[default2]:finetune_t0.py: error: unrecognized arguments: --reset-progress +[default3]: [--evidence-data-path EVIDENCE_DATA_PATH] +[default3]: [--retriever-report-topk-accuracies RETRIEVER_REPORT_TOPK_ACCURACIES [RETRIEVER_REPORT_TOPK_ACCURACIES ...]] +[default3]: [--retriever-score-scaling] +[default4]: [--use-one-sent-docs] +[default4]: [--evidence-data-path EVIDENCE_DATA_PATH] +[default0]: [--zero-contigious-gradients] +[default0]: [--zero-reduce-bucket-size ZERO_REDUCE_BUCKET_SIZE] +[default0]: [--zero-allgather-bucket-size ZERO_ALLGATHER_BUCKET_SIZE] +[default4]: [--retriever-report-topk-accuracies RETRIEVER_REPORT_TOPK_ACCURACIES [RETRIEVER_REPORT_TOPK_ACCURACIES ...]] +[default4]: [--retriever-score-scaling] +[default0]: [--remote-device {none,cpu,nvme}] [--use-pin-memory] +[default0]: [--scattered-embeddings] [--split-transformers] +[default3]: [--block-data-path BLOCK_DATA_PATH] +[default3]: [--embedding-path EMBEDDING_PATH] +[default3]: [--indexer-batch-size INDEXER_BATCH_SIZE] +[default0]: [--memory-centric-tiled-linear] +[default0]: [--tile-factor TILE_FACTOR] +[default0]: [--deepspeed-activation-checkpointing] +[default3]: [--indexer-log-interval INDEXER_LOG_INTERVAL] +[default3]: [--num-classes NUM_CLASSES] [--img-dim IMG_DIM] +[default0]: [--partition-activations] [--contigious-checkpointing] +[default4]: [--block-data-path BLOCK_DATA_PATH] +[default4]: [--embedding-path EMBEDDING_PATH] +[default3]: [--num-channels NUM_CHANNELS] [--patch-dim PATCH_DIM] +[default4]: [--indexer-batch-size INDEXER_BATCH_SIZE] +[default4]: [--indexer-log-interval INDEXER_LOG_INTERVAL] +[default3]: [--log-params-norm] [--log-num-zeros-in-grad] +[default3]: [--tensorboard-log-interval TENSORBOARD_LOG_INTERVAL] +[default3]: [--tensorboard-queue-size TENSORBOARD_QUEUE_SIZE] +[default3]: [--log-timers-to-tensorboard] +[default3]: [--log-batch-size-to-tensorboard] +[default0]: [--checkpoint-in-cpu] [--synchronize-each-layer] +[default0]: [--profile-backward] [--deepspeed] +[default0]: [--deepspeed_config DEEPSPEED_CONFIG] [--deepscale] +[default3]: [--no-log-learnig-rate-to-tensorboard] +[default3]: [--no-log-loss-scale-to-tensorboard] +[default3]: [--log-validation-ppl-to-tensorboard] +[default4]: [--num-classes NUM_CLASSES] [--img-dim IMG_DIM] +[default4]: [--num-channels NUM_CHANNELS] [--patch-dim PATCH_DIM] +[default0]: [--deepscale_config DEEPSCALE_CONFIG] [--deepspeed_mpi] +[default4]: [--log-params-norm] [--log-num-zeros-in-grad] +[default4]: [--tensorboard-log-interval TENSORBOARD_LOG_INTERVAL] +[default3]: [--zero-stage ZERO_STAGE] [--zero-reduce-scatter] +[default3]: [--zero-contigious-gradients] +[default3]: [--zero-reduce-bucket-size ZERO_REDUCE_BUCKET_SIZE] +[default0]:finetune_t0.py: error: unrecognized arguments: --reset-progress +[default4]: [--tensorboard-queue-size TENSORBOARD_QUEUE_SIZE] +[default4]: [--log-timers-to-tensorboard] +[default4]: [--log-batch-size-to-tensorboard] +[default3]: [--zero-allgather-bucket-size ZERO_ALLGATHER_BUCKET_SIZE] +[default3]: [--remote-device {none,cpu,nvme}] [--use-pin-memory] +[default3]: [--scattered-embeddings] [--split-transformers] +[default4]: [--no-log-learnig-rate-to-tensorboard] +[default4]: [--no-log-loss-scale-to-tensorboard] +[default3]: [--memory-centric-tiled-linear] +[default4]: [--log-validation-ppl-to-tensorboard] +[default4]: [--zero-stage ZERO_STAGE] [--zero-reduce-scatter] +[default4]: [--zero-contigious-gradients] +[default3]: [--tile-factor TILE_FACTOR] +[default4]: [--zero-reduce-bucket-size ZERO_REDUCE_BUCKET_SIZE] +[default4]: [--zero-allgather-bucket-size ZERO_ALLGATHER_BUCKET_SIZE] +[default3]: [--deepspeed-activation-checkpointing] +[default3]: [--partition-activations] [--contigious-checkpointing] +[default3]: [--checkpoint-in-cpu] [--synchronize-each-layer] +[default3]: [--profile-backward] [--deepspeed] +[default4]: [--remote-device {none,cpu,nvme}] [--use-pin-memory] +[default4]: [--scattered-embeddings] [--split-transformers] +[default3]: [--deepspeed_config DEEPSPEED_CONFIG] [--deepscale] +[default4]: [--memory-centric-tiled-linear] +[default4]: [--tile-factor TILE_FACTOR] +[default3]: [--deepscale_config DEEPSCALE_CONFIG] [--deepspeed_mpi] +[default4]: [--deepspeed-activation-checkpointing] +[default4]: [--partition-activations] [--contigious-checkpointing] +[default3]:finetune_t0.py: error: unrecognized arguments: --reset-progress +[default4]: [--checkpoint-in-cpu] [--synchronize-each-layer] +[default4]: [--profile-backward] [--deepspeed] +[default4]: [--deepspeed_config DEEPSPEED_CONFIG] [--deepscale] +[default4]: [--deepscale_config DEEPSCALE_CONFIG] [--deepspeed_mpi] +[default4]:finetune_t0.py: error: unrecognized arguments: --reset-progress +[default6]:usage: finetune_t0.py [-h] [--num-layers NUM_LAYERS] +[default7]:usage: finetune_t0.py [-h] [--num-layers NUM_LAYERS] +[default7]: [--hidden-size HIDDEN_SIZE] +[default6]: [--hidden-size HIDDEN_SIZE] +[default7]: [--ffn-hidden-size FFN_HIDDEN_SIZE] +[default7]: [--num-attention-heads NUM_ATTENTION_HEADS] +[default7]: [--kv-channels KV_CHANNELS] +[default7]: [--max-position-embeddings MAX_POSITION_EMBEDDINGS] +[default7]: [--make-vocab-size-divisible-by MAKE_VOCAB_SIZE_DIVISIBLE_BY] +[default7]: [--pad-vocab-size-to PAD_VOCAB_SIZE_TO] +[default7]: [--layernorm-epsilon LAYERNORM_EPSILON] +[default6]: [--ffn-hidden-size FFN_HIDDEN_SIZE] +[default6]: [--num-attention-heads NUM_ATTENTION_HEADS] +[default7]: [--sync-tp-duplicated-parameters] +[default6]: [--kv-channels KV_CHANNELS] +[default6]: [--max-position-embeddings MAX_POSITION_EMBEDDINGS] +[default6]: [--make-vocab-size-divisible-by MAKE_VOCAB_SIZE_DIVISIBLE_BY] +[default7]: [--apply-residual-connection-post-layernorm] +[default7]: [--embed-layernorm] [--openai-gelu] +[default7]: [--onnx-safe ONNX_SAFE] [--bert-no-binary-head] +[default6]: [--pad-vocab-size-to PAD_VOCAB_SIZE_TO] +[default6]: [--layernorm-epsilon LAYERNORM_EPSILON] +[default7]: [--position-embedding-type {PositionEmbeddingType.rotary,PositionEmbeddingType.absolute,PositionEmbeddingType.alibi}] +[default7]: [--glu-activation {geglu,liglu,reglu,swiglu}] +[default6]: [--sync-tp-duplicated-parameters] +[default6]: [--apply-residual-connection-post-layernorm] +[default5]:usage: finetune_t0.py [-h] [--num-layers NUM_LAYERS] +[default5]: [--hidden-size HIDDEN_SIZE] +[default5]: [--ffn-hidden-size FFN_HIDDEN_SIZE] +[default6]: [--embed-layernorm] [--openai-gelu] +[default7]: [--kill-switch-path KILL_SWITCH_PATH] +[default7]: [--log-level {debug,info,warning,error,critical}] +[default7]: [--log-level-replica {debug,info,warning,error,critical}] +[default7]: [--attention-dropout ATTENTION_DROPOUT] +[default6]: [--onnx-safe ONNX_SAFE] [--bert-no-binary-head] +[default6]: [--position-embedding-type {PositionEmbeddingType.rotary,PositionEmbeddingType.absolute,PositionEmbeddingType.alibi}] +[default5]: [--num-attention-heads NUM_ATTENTION_HEADS] +[default5]: [--kv-channels KV_CHANNELS] +[default5]: [--max-position-embeddings MAX_POSITION_EMBEDDINGS] +[default6]: [--glu-activation {geglu,liglu,reglu,swiglu}] +[default6]: [--kill-switch-path KILL_SWITCH_PATH] +[default6]: [--log-level {debug,info,warning,error,critical}] +[default5]: [--make-vocab-size-divisible-by MAKE_VOCAB_SIZE_DIVISIBLE_BY] +[default5]: [--pad-vocab-size-to PAD_VOCAB_SIZE_TO] +[default7]: [--hidden-dropout HIDDEN_DROPOUT] +[default5]: [--layernorm-epsilon LAYERNORM_EPSILON] +[default5]: [--sync-tp-duplicated-parameters] +[default6]: [--log-level-replica {debug,info,warning,error,critical}] +[default6]: [--attention-dropout ATTENTION_DROPOUT] +[default7]: [--weight-decay WEIGHT_DECAY] [--clip-grad CLIP_GRAD] +[default7]: [--adam-beta1 ADAM_BETA1] [--adam-beta2 ADAM_BETA2] +[default7]: [--adam-eps ADAM_EPS] [--sgd-momentum SGD_MOMENTUM] +[default5]: [--apply-residual-connection-post-layernorm] +[default5]: [--embed-layernorm] [--openai-gelu] +[default6]: [--hidden-dropout HIDDEN_DROPOUT] +[default6]: [--weight-decay WEIGHT_DECAY] [--clip-grad CLIP_GRAD] +[default6]: [--adam-beta1 ADAM_BETA1] [--adam-beta2 ADAM_BETA2] +[default5]: [--onnx-safe ONNX_SAFE] [--bert-no-binary-head] +[default5]: [--position-embedding-type {PositionEmbeddingType.rotary,PositionEmbeddingType.absolute,PositionEmbeddingType.alibi}] +[default5]: [--glu-activation {geglu,liglu,reglu,swiglu}] +[default5]: [--kill-switch-path KILL_SWITCH_PATH] +[default5]: [--log-level {debug,info,warning,error,critical}] +[default5]: [--log-level-replica {debug,info,warning,error,critical}] +[default7]: [--micro-batch-size MICRO_BATCH_SIZE] +[default7]: [--batch-size BATCH_SIZE] +[default5]: [--attention-dropout ATTENTION_DROPOUT] +[default5]: [--hidden-dropout HIDDEN_DROPOUT] +[default6]: [--adam-eps ADAM_EPS] [--sgd-momentum SGD_MOMENTUM] +[default6]: [--micro-batch-size MICRO_BATCH_SIZE] +[default6]: [--batch-size BATCH_SIZE] +[default7]: [--global-batch-size GLOBAL_BATCH_SIZE] +[default7]: [--rampup-batch-size [RAMPUP_BATCH_SIZE [RAMPUP_BATCH_SIZE ...]]] +[default7]: [--checkpoint-activations] +[default7]: [--distribute-checkpointed-activations] +[default5]: [--weight-decay WEIGHT_DECAY] [--clip-grad CLIP_GRAD] +[default7]: [--checkpoint-num-layers CHECKPOINT_NUM_LAYERS] +[default5]: [--adam-beta1 ADAM_BETA1] [--adam-beta2 ADAM_BETA2] +[default5]: [--adam-eps ADAM_EPS] [--sgd-momentum SGD_MOMENTUM] +[default6]: [--global-batch-size GLOBAL_BATCH_SIZE] +[default6]: [--rampup-batch-size [RAMPUP_BATCH_SIZE [RAMPUP_BATCH_SIZE ...]]] +[default6]: [--checkpoint-activations] +[default5]: [--micro-batch-size MICRO_BATCH_SIZE] +[default5]: [--batch-size BATCH_SIZE] +[default7]: [--train-iters TRAIN_ITERS] +[default7]: [--train-samples TRAIN_SAMPLES] +[default7]: [--train-tokens TRAIN_TOKENS] +[default6]: [--distribute-checkpointed-activations] +[default6]: [--checkpoint-num-layers CHECKPOINT_NUM_LAYERS] +[default6]: [--train-iters TRAIN_ITERS] +[default6]: [--train-samples TRAIN_SAMPLES] +[default7]: [--log-interval LOG_INTERVAL] +[default7]: [--exit-interval EXIT_INTERVAL] +[default6]: [--train-tokens TRAIN_TOKENS] +[default6]: [--log-interval LOG_INTERVAL] +[default7]: [--exit-duration-in-mins EXIT_DURATION_IN_MINS] +[default7]: [--tensorboard-dir TENSORBOARD_DIR] +[default6]: [--exit-interval EXIT_INTERVAL] +[default6]: [--exit-duration-in-mins EXIT_DURATION_IN_MINS] +[default6]: [--tensorboard-dir TENSORBOARD_DIR] +[default7]: [--no-masked-softmax-fusion] [--no-bias-gelu-fusion] +[default7]: [--no-bias-dropout-fusion] [--optimizer {adam,sgd}] +[default7]: [--use-bnb-optimizer] +[default6]: [--no-masked-softmax-fusion] [--no-bias-gelu-fusion] +[default6]: [--no-bias-dropout-fusion] [--optimizer {adam,sgd}] +[default5]: [--global-batch-size GLOBAL_BATCH_SIZE] +[default5]: [--rampup-batch-size [RAMPUP_BATCH_SIZE [RAMPUP_BATCH_SIZE ...]]] +[default5]: [--checkpoint-activations] +[default6]: [--use-bnb-optimizer] +[default6]: [--dataloader-type {single,cyclic}] [--cpu-optimizer] +[default6]: [--cpu_torch_adam] [--codecarbon-dir CODECARBON_DIR] +[default5]: [--distribute-checkpointed-activations] +[default5]: [--checkpoint-num-layers CHECKPOINT_NUM_LAYERS] +[default6]: [--eval-only EVAL_ONLY] +[default7]: [--dataloader-type {single,cyclic}] [--cpu-optimizer] +[default7]: [--cpu_torch_adam] [--codecarbon-dir CODECARBON_DIR] +[default7]: [--eval-only EVAL_ONLY] +[default6]: [--skip-train-iteration-range SKIP_TRAIN_ITERATION_RANGE [SKIP_TRAIN_ITERATION_RANGE ...]] +[default6]: [--inference] +[default6]: [--abort-on-unmet-fused-kernel-constraints] +[default7]: [--skip-train-iteration-range SKIP_TRAIN_ITERATION_RANGE [SKIP_TRAIN_ITERATION_RANGE ...]] +[default7]: [--inference] +[default6]: [--pp-partition-method PP_PARTITION_METHOD] +[default6]: [--seed SEED] [--init-method-std INIT_METHOD_STD] +[default5]: [--train-iters TRAIN_ITERS] +[default5]: [--train-samples TRAIN_SAMPLES] +[default7]: [--abort-on-unmet-fused-kernel-constraints] +[default5]: [--train-tokens TRAIN_TOKENS] +[default7]: [--pp-partition-method PP_PARTITION_METHOD] +[default7]: [--seed SEED] [--init-method-std INIT_METHOD_STD] +[default7]: [--init-method-xavier-uniform] [--lr LR] +[default7]: [--lr-decay-style {constant,linear,cosine}] +[default6]: [--init-method-xavier-uniform] [--lr LR] +[default6]: [--lr-decay-style {constant,linear,cosine}] +[default7]: [--lr-decay-iters LR_DECAY_ITERS] +[default7]: [--lr-decay-samples LR_DECAY_SAMPLES] +[default7]: [--lr-decay-tokens LR_DECAY_TOKENS] +[default6]: [--lr-decay-iters LR_DECAY_ITERS] +[default6]: [--lr-decay-samples LR_DECAY_SAMPLES] +[default7]: [--lr-warmup-fraction LR_WARMUP_FRACTION] +[default7]: [--lr-warmup-iters LR_WARMUP_ITERS] +[default6]: [--lr-decay-tokens LR_DECAY_TOKENS] +[default6]: [--lr-warmup-fraction LR_WARMUP_FRACTION] +[default5]: [--log-interval LOG_INTERVAL] +[default5]: [--exit-interval EXIT_INTERVAL] +[default7]: [--lr-warmup-samples LR_WARMUP_SAMPLES] +[default7]: [--warmup WARMUP] [--min-lr MIN_LR] +[default5]: [--exit-duration-in-mins EXIT_DURATION_IN_MINS] +[default5]: [--tensorboard-dir TENSORBOARD_DIR] +[default6]: [--lr-warmup-iters LR_WARMUP_ITERS] +[default6]: [--lr-warmup-samples LR_WARMUP_SAMPLES] +[default6]: [--warmup WARMUP] [--min-lr MIN_LR] +[default5]: [--no-masked-softmax-fusion] [--no-bias-gelu-fusion] +[default7]: [--override-lr-scheduler] +[default7]: [--use-checkpoint-lr-scheduler] [--save SAVE] +[default6]: [--override-lr-scheduler] +[default6]: [--use-checkpoint-lr-scheduler] [--save SAVE] +[default7]: [--save-interval SAVE_INTERVAL] [--no-save-optim] +[default7]: [--no-save-rng] [--load LOAD] [--no-load-optim] +[default6]: [--save-interval SAVE_INTERVAL] [--no-save-optim] +[default6]: [--no-save-rng] [--load LOAD] [--no-load-optim] +[default7]: [--no-load-rng] [--finetune] [--fp16] [--bf16] +[default7]: [--loss-scale LOSS_SCALE] +[default7]: [--initial-loss-scale INITIAL_LOSS_SCALE] +[default7]: [--min-loss-scale MIN_LOSS_SCALE] +[default6]: [--no-load-rng] [--finetune] [--fp16] [--bf16] +[default6]: [--loss-scale LOSS_SCALE] +[default5]: [--no-bias-dropout-fusion] [--optimizer {adam,sgd}] +[default5]: [--use-bnb-optimizer] +[default7]: [--loss-scale-window LOSS_SCALE_WINDOW] +[default7]: [--hysteresis HYSTERESIS] [--fp32-residual-connection] +[default5]: [--dataloader-type {single,cyclic}] [--cpu-optimizer] +[default5]: [--cpu_torch_adam] [--codecarbon-dir CODECARBON_DIR] +[default6]: [--initial-loss-scale INITIAL_LOSS_SCALE] +[default6]: [--min-loss-scale MIN_LOSS_SCALE] +[default5]: [--eval-only EVAL_ONLY] +[default5]: [--skip-train-iteration-range SKIP_TRAIN_ITERATION_RANGE [SKIP_TRAIN_ITERATION_RANGE ...]] +[default7]: [--no-query-key-layer-scaling] +[default5]: [--inference] +[default5]: [--abort-on-unmet-fused-kernel-constraints] +[default6]: [--loss-scale-window LOSS_SCALE_WINDOW] +[default6]: [--hysteresis HYSTERESIS] [--fp32-residual-connection] +[default6]: [--no-query-key-layer-scaling] +[default6]: [--attention-softmax-in-fp32] +[default5]: [--pp-partition-method PP_PARTITION_METHOD] +[default5]: [--seed SEED] [--init-method-std INIT_METHOD_STD] +[default7]: [--attention-softmax-in-fp32] +[default5]: [--init-method-xavier-uniform] [--lr LR] +[default5]: [--lr-decay-style {constant,linear,cosine}] +[default7]: [--accumulate-allreduce-grads-in-fp32] +[default5]: [--lr-decay-iters LR_DECAY_ITERS] +[default5]: [--lr-decay-samples LR_DECAY_SAMPLES] +[default7]: [--fp16-lm-cross-entropy] +[default7]: [--tensor-model-parallel-size TENSOR_MODEL_PARALLEL_SIZE] +[default7]: [--pipeline-model-parallel-size PIPELINE_MODEL_PARALLEL_SIZE] +[default7]: [--model-parallel-size MODEL_PARALLEL_SIZE] +[default6]: [--accumulate-allreduce-grads-in-fp32] +[default6]: [--fp16-lm-cross-entropy] +[default5]: [--lr-decay-tokens LR_DECAY_TOKENS] +[default5]: [--lr-warmup-fraction LR_WARMUP_FRACTION] +[default7]: [--num-layers-per-virtual-pipeline-stage NUM_LAYERS_PER_VIRTUAL_PIPELINE_STAGE] +[default5]: [--lr-warmup-iters LR_WARMUP_ITERS] +[default5]: [--lr-warmup-samples LR_WARMUP_SAMPLES] +[default5]: [--warmup WARMUP] [--min-lr MIN_LR] +[default7]: [--distributed-backend {nccl,gloo}] +[default6]: [--tensor-model-parallel-size TENSOR_MODEL_PARALLEL_SIZE] +[default7]: [--DDP-impl {local,torch}] +[default7]: [--use-contiguous-buffers-in-ddp] +[default7]: [--no-scatter-gather-tensors-in-pipeline] +[default7]: [--local_rank LOCAL_RANK] +[default6]: [--pipeline-model-parallel-size PIPELINE_MODEL_PARALLEL_SIZE] +[default6]: [--model-parallel-size MODEL_PARALLEL_SIZE] +[default6]: [--num-layers-per-virtual-pipeline-stage NUM_LAYERS_PER_VIRTUAL_PIPELINE_STAGE] +[default6]: [--distributed-backend {nccl,gloo}] +[default7]: [--lazy-mpu-init LAZY_MPU_INIT] +[default7]: [--use-cpu-initialization] [--eval-iters EVAL_ITERS] +[default7]: [--eval-interval EVAL_INTERVAL] +[default7]: [--data-path [DATA_PATH [DATA_PATH ...]]] +[default7]: [--split SPLIT] +[default7]: [--train-weighted-split-paths [TRAIN_WEIGHTED_SPLIT_PATHS [TRAIN_WEIGHTED_SPLIT_PATHS ...]]] +[default7]: [--valid-weighted-split-paths [VALID_WEIGHTED_SPLIT_PATHS [VALID_WEIGHTED_SPLIT_PATHS ...]]] +[default7]: [--test-weighted-split-paths [TEST_WEIGHTED_SPLIT_PATHS [TEST_WEIGHTED_SPLIT_PATHS ...]]] +[default7]: [--train-weighted-split-paths-path TRAIN_WEIGHTED_SPLIT_PATHS_PATH] +[default7]: [--valid-weighted-split-paths-path VALID_WEIGHTED_SPLIT_PATHS_PATH] +[default7]: [--test-weighted-split-paths-path TEST_WEIGHTED_SPLIT_PATHS_PATH] +[default7]: [--log-path LOG_PATH] [--vocab-file VOCAB_FILE] +[default6]: [--DDP-impl {local,torch}] +[default6]: [--use-contiguous-buffers-in-ddp] +[default6]: [--no-scatter-gather-tensors-in-pipeline] +[default5]: [--override-lr-scheduler] +[default5]: [--use-checkpoint-lr-scheduler] [--save SAVE] +[default7]: [--merge-file MERGE_FILE] +[default5]: [--save-interval SAVE_INTERVAL] [--no-save-optim] +[default5]: [--no-save-rng] [--load LOAD] [--no-load-optim] +[default7]: [--vocab-extra-ids VOCAB_EXTRA_IDS] +[default5]: [--no-load-rng] [--finetune] [--fp16] [--bf16] +[default5]: [--loss-scale LOSS_SCALE] +[default5]: [--initial-loss-scale INITIAL_LOSS_SCALE] +[default7]: [--seq-length SEQ_LENGTH] +[default5]: [--min-loss-scale MIN_LOSS_SCALE] +[default5]: [--loss-scale-window LOSS_SCALE_WINDOW] +[default6]: [--local_rank LOCAL_RANK] +[default6]: [--lazy-mpu-init LAZY_MPU_INIT] +[default5]: [--hysteresis HYSTERESIS] [--fp32-residual-connection] +[default5]: [--no-query-key-layer-scaling] +[default6]: [--use-cpu-initialization] [--eval-iters EVAL_ITERS] +[default6]: [--eval-interval EVAL_INTERVAL] +[default6]: [--data-path [DATA_PATH [DATA_PATH ...]]] +[default6]: [--split SPLIT] +[default5]: [--attention-softmax-in-fp32] +[default5]: [--accumulate-allreduce-grads-in-fp32] +[default5]: [--fp16-lm-cross-entropy] +[default5]: [--tensor-model-parallel-size TENSOR_MODEL_PARALLEL_SIZE] +[default7]: [--encoder-seq-length ENCODER_SEQ_LENGTH] +[default7]: [--decoder-seq-length DECODER_SEQ_LENGTH] +[default5]: [--pipeline-model-parallel-size PIPELINE_MODEL_PARALLEL_SIZE] +[default5]: [--model-parallel-size MODEL_PARALLEL_SIZE] +[default5]: [--num-layers-per-virtual-pipeline-stage NUM_LAYERS_PER_VIRTUAL_PIPELINE_STAGE] +[default7]: [--retriever-seq-length RETRIEVER_SEQ_LENGTH] +[default5]: [--distributed-backend {nccl,gloo}] +[default5]: [--DDP-impl {local,torch}] +[default5]: [--use-contiguous-buffers-in-ddp] +[default7]: [--sample-rate SAMPLE_RATE] [--mask-prob MASK_PROB] +[default5]: [--no-scatter-gather-tensors-in-pipeline] +[default5]: [--local_rank LOCAL_RANK] +[default7]: [--short-seq-prob SHORT_SEQ_PROB] [--mmap-warmup] +[default7]: [--num-workers NUM_WORKERS] +[default5]: [--lazy-mpu-init LAZY_MPU_INIT] +[default5]: [--use-cpu-initialization] [--eval-iters EVAL_ITERS] +[default5]: [--eval-interval EVAL_INTERVAL] +[default7]: [--valid-num-workers VALID_NUM_WORKERS] +[default5]: [--data-path [DATA_PATH [DATA_PATH ...]]] +[default5]: [--split SPLIT] +[default7]: [--tokenizer-type {BertWordPieceLowerCase,BertWordPieceCase,GPT2BPETokenizer,PretrainedFromHF}] +[default5]: [--train-weighted-split-paths [TRAIN_WEIGHTED_SPLIT_PATHS [TRAIN_WEIGHTED_SPLIT_PATHS ...]]] +[default5]: [--valid-weighted-split-paths [VALID_WEIGHTED_SPLIT_PATHS [VALID_WEIGHTED_SPLIT_PATHS ...]]] +[default6]: [--train-weighted-split-paths [TRAIN_WEIGHTED_SPLIT_PATHS [TRAIN_WEIGHTED_SPLIT_PATHS ...]]] +[default6]: [--valid-weighted-split-paths [VALID_WEIGHTED_SPLIT_PATHS [VALID_WEIGHTED_SPLIT_PATHS ...]]] +[default5]: [--test-weighted-split-paths [TEST_WEIGHTED_SPLIT_PATHS [TEST_WEIGHTED_SPLIT_PATHS ...]]] +[default5]: [--train-weighted-split-paths-path TRAIN_WEIGHTED_SPLIT_PATHS_PATH] +[default5]: [--valid-weighted-split-paths-path VALID_WEIGHTED_SPLIT_PATHS_PATH] +[default5]: [--test-weighted-split-paths-path TEST_WEIGHTED_SPLIT_PATHS_PATH] +[default5]: [--log-path LOG_PATH] [--vocab-file VOCAB_FILE] +[default5]: [--merge-file MERGE_FILE] +[default5]: [--vocab-extra-ids VOCAB_EXTRA_IDS] +[default5]: [--seq-length SEQ_LENGTH] +[default5]: [--encoder-seq-length ENCODER_SEQ_LENGTH] +[default7]: [--tokenizer-name-or-path TOKENIZER_NAME_OR_PATH] +[default6]: [--test-weighted-split-paths [TEST_WEIGHTED_SPLIT_PATHS [TEST_WEIGHTED_SPLIT_PATHS ...]]] +[default6]: [--train-weighted-split-paths-path TRAIN_WEIGHTED_SPLIT_PATHS_PATH] +[default5]: [--decoder-seq-length DECODER_SEQ_LENGTH] +[default5]: [--retriever-seq-length RETRIEVER_SEQ_LENGTH] +[default7]: [--data-impl {lazy,cached,mmap,infer}] +[default5]: [--sample-rate SAMPLE_RATE] [--mask-prob MASK_PROB] +[default5]: [--short-seq-prob SHORT_SEQ_PROB] [--mmap-warmup] +[default7]: [--reset-position-ids] [--reset-attention-mask] +[default5]: [--num-workers NUM_WORKERS] +[default5]: [--valid-num-workers VALID_NUM_WORKERS] +[default6]: [--valid-weighted-split-paths-path VALID_WEIGHTED_SPLIT_PATHS_PATH] +[default6]: [--test-weighted-split-paths-path TEST_WEIGHTED_SPLIT_PATHS_PATH] +[default6]: [--log-path LOG_PATH] [--vocab-file VOCAB_FILE] +[default7]: [--eod-mask-loss] [--loss-on-targets-only] +[default7]: [--norm-target-loss] +[default6]: [--merge-file MERGE_FILE] +[default6]: [--vocab-extra-ids VOCAB_EXTRA_IDS] +[default5]: [--tokenizer-type {BertWordPieceLowerCase,BertWordPieceCase,GPT2BPETokenizer,PretrainedFromHF}] +[default5]: [--tokenizer-name-or-path TOKENIZER_NAME_OR_PATH] +[default5]: [--data-impl {lazy,cached,mmap,infer}] +[default5]: [--reset-position-ids] [--reset-attention-mask] +[default7]: [--reweight-loss-based-on-position-frequency] +[default7]: [--noise-density NOISE_DENSITY] +[default7]: [--mean-noise-span-length MEAN_NOISE_SPAN_LENGTH] +[default6]: [--seq-length SEQ_LENGTH] +[default6]: [--encoder-seq-length ENCODER_SEQ_LENGTH] +[default5]: [--eod-mask-loss] [--loss-on-targets-only] +[default5]: [--norm-target-loss] +[default7]: [--prefixlm] [--adlr-autoresume] +[default5]: [--reweight-loss-based-on-position-frequency] +[default5]: [--noise-density NOISE_DENSITY] +[default7]: [--adlr-autoresume-interval ADLR_AUTORESUME_INTERVAL] +[default7]: [--ict-head-size ICT_HEAD_SIZE] +[default5]: [--mean-noise-span-length MEAN_NOISE_SPAN_LENGTH] +[default5]: [--prefixlm] [--adlr-autoresume] +[default5]: [--adlr-autoresume-interval ADLR_AUTORESUME_INTERVAL] +[default7]: [--biencoder-projection-dim BIENCODER_PROJECTION_DIM] +[default7]: [--biencoder-shared-query-context-model] +[default7]: [--ict-load ICT_LOAD] [--bert-load BERT_LOAD] +[default5]: [--ict-head-size ICT_HEAD_SIZE] +[default5]: [--biencoder-projection-dim BIENCODER_PROJECTION_DIM] +[default5]: [--biencoder-shared-query-context-model] +[default5]: [--ict-load ICT_LOAD] [--bert-load BERT_LOAD] +[default5]: [--titles-data-path TITLES_DATA_PATH] +[default5]: [--query-in-block-prob QUERY_IN_BLOCK_PROB] +[default5]: [--use-one-sent-docs] +[default7]: [--titles-data-path TITLES_DATA_PATH] +[default5]: [--evidence-data-path EVIDENCE_DATA_PATH] +[default5]: [--retriever-report-topk-accuracies RETRIEVER_REPORT_TOPK_ACCURACIES [RETRIEVER_REPORT_TOPK_ACCURACIES ...]] +[default5]: [--retriever-score-scaling] +[default7]: [--query-in-block-prob QUERY_IN_BLOCK_PROB] +[default7]: [--use-one-sent-docs] +[default6]: [--decoder-seq-length DECODER_SEQ_LENGTH] +[default6]: [--retriever-seq-length RETRIEVER_SEQ_LENGTH] +[default6]: [--sample-rate SAMPLE_RATE] [--mask-prob MASK_PROB] +[default7]: [--evidence-data-path EVIDENCE_DATA_PATH] +[default7]: [--retriever-report-topk-accuracies RETRIEVER_REPORT_TOPK_ACCURACIES [RETRIEVER_REPORT_TOPK_ACCURACIES ...]] +[default7]: [--retriever-score-scaling] +[default7]: [--block-data-path BLOCK_DATA_PATH] +[default7]: [--embedding-path EMBEDDING_PATH] +[default7]: [--indexer-batch-size INDEXER_BATCH_SIZE] +[default7]: [--indexer-log-interval INDEXER_LOG_INTERVAL] +[default6]: [--short-seq-prob SHORT_SEQ_PROB] [--mmap-warmup] +[default6]: [--num-workers NUM_WORKERS] +[default6]: [--valid-num-workers VALID_NUM_WORKERS] +[default5]: [--block-data-path BLOCK_DATA_PATH] +[default5]: [--embedding-path EMBEDDING_PATH] +[default7]: [--num-classes NUM_CLASSES] [--img-dim IMG_DIM] +[default7]: [--num-channels NUM_CHANNELS] [--patch-dim PATCH_DIM] +[default5]: [--indexer-batch-size INDEXER_BATCH_SIZE] +[default5]: [--indexer-log-interval INDEXER_LOG_INTERVAL] +[default6]: [--tokenizer-type {BertWordPieceLowerCase,BertWordPieceCase,GPT2BPETokenizer,PretrainedFromHF}] +[default6]: [--tokenizer-name-or-path TOKENIZER_NAME_OR_PATH] +[default5]: [--num-classes NUM_CLASSES] [--img-dim IMG_DIM] +[default5]: [--num-channels NUM_CHANNELS] [--patch-dim PATCH_DIM] +[default7]: [--log-params-norm] [--log-num-zeros-in-grad] +[default5]: [--log-params-norm] [--log-num-zeros-in-grad] +[default6]: [--data-impl {lazy,cached,mmap,infer}] +[default6]: [--reset-position-ids] [--reset-attention-mask] +[default5]: [--tensorboard-log-interval TENSORBOARD_LOG_INTERVAL] +[default5]: [--tensorboard-queue-size TENSORBOARD_QUEUE_SIZE] +[default7]: [--tensorboard-log-interval TENSORBOARD_LOG_INTERVAL] +[default7]: [--tensorboard-queue-size TENSORBOARD_QUEUE_SIZE] +[default7]: [--log-timers-to-tensorboard] +[default7]: [--log-batch-size-to-tensorboard] +[default6]: [--eod-mask-loss] [--loss-on-targets-only] +[default6]: [--norm-target-loss] +[default5]: [--log-timers-to-tensorboard] +[default5]: [--log-batch-size-to-tensorboard] +[default7]: [--no-log-learnig-rate-to-tensorboard] +[default5]: [--no-log-learnig-rate-to-tensorboard] +[default5]: [--no-log-loss-scale-to-tensorboard] +[default5]: [--log-validation-ppl-to-tensorboard] +[default7]: [--no-log-loss-scale-to-tensorboard] +[default5]: [--zero-stage ZERO_STAGE] [--zero-reduce-scatter] +[default5]: [--zero-contigious-gradients] +[default6]: [--reweight-loss-based-on-position-frequency] +[default6]: [--noise-density NOISE_DENSITY] +[default6]: [--mean-noise-span-length MEAN_NOISE_SPAN_LENGTH] +[default6]: [--prefixlm] [--adlr-autoresume] +[default7]: [--log-validation-ppl-to-tensorboard] +[default7]: [--zero-stage ZERO_STAGE] [--zero-reduce-scatter] +[default5]: [--zero-reduce-bucket-size ZERO_REDUCE_BUCKET_SIZE] +[default5]: [--zero-allgather-bucket-size ZERO_ALLGATHER_BUCKET_SIZE] +[default5]: [--remote-device {none,cpu,nvme}] [--use-pin-memory] +[default7]: [--zero-contigious-gradients] +[default7]: [--zero-reduce-bucket-size ZERO_REDUCE_BUCKET_SIZE] +[default7]: [--zero-allgather-bucket-size ZERO_ALLGATHER_BUCKET_SIZE] +[default5]: [--scattered-embeddings] [--split-transformers] +[default5]: [--memory-centric-tiled-linear] +[default5]: [--tile-factor TILE_FACTOR] +[default5]: [--deepspeed-activation-checkpointing] +[default5]: [--partition-activations] [--contigious-checkpointing] +[default7]: [--remote-device {none,cpu,nvme}] [--use-pin-memory] +[default5]: [--checkpoint-in-cpu] [--synchronize-each-layer] +[default5]: [--profile-backward] [--deepspeed] +[default6]: [--adlr-autoresume-interval ADLR_AUTORESUME_INTERVAL] +[default6]: [--ict-head-size ICT_HEAD_SIZE] +[default7]: [--scattered-embeddings] [--split-transformers] +[default6]: [--biencoder-projection-dim BIENCODER_PROJECTION_DIM] +[default6]: [--biencoder-shared-query-context-model] +[default6]: [--ict-load ICT_LOAD] [--bert-load BERT_LOAD] +[default7]: [--memory-centric-tiled-linear] +[default7]: [--tile-factor TILE_FACTOR] +[default6]: [--titles-data-path TITLES_DATA_PATH] +[default6]: [--query-in-block-prob QUERY_IN_BLOCK_PROB] +[default7]: [--deepspeed-activation-checkpointing] +[default7]: [--partition-activations] [--contigious-checkpointing] +[default6]: [--use-one-sent-docs] +[default6]: [--evidence-data-path EVIDENCE_DATA_PATH] +[default5]: [--deepspeed_config DEEPSPEED_CONFIG] [--deepscale] +[default5]: [--deepscale_config DEEPSCALE_CONFIG] [--deepspeed_mpi] +[default7]: [--checkpoint-in-cpu] [--synchronize-each-layer] +[default5]:finetune_t0.py: error: unrecognized arguments: --reset-progress +[default7]: [--profile-backward] [--deepspeed] +[default6]: [--retriever-report-topk-accuracies RETRIEVER_REPORT_TOPK_ACCURACIES [RETRIEVER_REPORT_TOPK_ACCURACIES ...]] +[default6]: [--retriever-score-scaling] +[default7]: [--deepspeed_config DEEPSPEED_CONFIG] [--deepscale] +[default7]: [--deepscale_config DEEPSCALE_CONFIG] [--deepspeed_mpi] +[default7]:finetune_t0.py: error: unrecognized arguments: --reset-progress +[default6]: [--block-data-path BLOCK_DATA_PATH] +[default6]: [--embedding-path EMBEDDING_PATH] +[default6]: [--indexer-batch-size INDEXER_BATCH_SIZE] +[default6]: [--indexer-log-interval INDEXER_LOG_INTERVAL] +[default6]: [--num-classes NUM_CLASSES] [--img-dim IMG_DIM] +[default6]: [--num-channels NUM_CHANNELS] [--patch-dim PATCH_DIM] +[default6]: [--log-params-norm] [--log-num-zeros-in-grad] +[default6]: [--tensorboard-log-interval TENSORBOARD_LOG_INTERVAL] +[default6]: [--tensorboard-queue-size TENSORBOARD_QUEUE_SIZE] +[default6]: [--log-timers-to-tensorboard] +[default6]: [--log-batch-size-to-tensorboard] +[default6]: [--no-log-learnig-rate-to-tensorboard] +[default6]: [--no-log-loss-scale-to-tensorboard] +[default6]: [--log-validation-ppl-to-tensorboard] +[default6]: [--zero-stage ZERO_STAGE] [--zero-reduce-scatter] +[default6]: [--zero-contigious-gradients] +[default6]: [--zero-reduce-bucket-size ZERO_REDUCE_BUCKET_SIZE] +[default6]: [--zero-allgather-bucket-size ZERO_ALLGATHER_BUCKET_SIZE] +[default6]: [--remote-device {none,cpu,nvme}] [--use-pin-memory] +[default6]: [--scattered-embeddings] [--split-transformers] +[default6]: [--memory-centric-tiled-linear] +[default6]: [--tile-factor TILE_FACTOR] +[default6]: [--deepspeed-activation-checkpointing] +[default6]: [--partition-activations] [--contigious-checkpointing] +[default6]: [--checkpoint-in-cpu] [--synchronize-each-layer] +[default6]: [--profile-backward] [--deepspeed] +[default6]: [--deepspeed_config DEEPSPEED_CONFIG] [--deepscale] +[default6]: [--deepscale_config DEEPSCALE_CONFIG] [--deepspeed_mpi] +[default6]:finetune_t0.py: error: unrecognized arguments: --reset-progress +[default7]:usage: finetune_t0.py [-h] [--num-layers NUM_LAYERS] +[default6]:usage: finetune_t0.py [-h] [--num-layers NUM_LAYERS] +[default6]: [--hidden-size HIDDEN_SIZE] +[default7]: [--hidden-size HIDDEN_SIZE] +[default7]: [--ffn-hidden-size FFN_HIDDEN_SIZE] +[default5]:usage: finetune_t0.py [-h] [--num-layers NUM_LAYERS] +[default5]: [--hidden-size HIDDEN_SIZE] +[default5]: [--ffn-hidden-size FFN_HIDDEN_SIZE] +[default5]: [--num-attention-heads NUM_ATTENTION_HEADS] +[default7]: [--num-attention-heads NUM_ATTENTION_HEADS] +[default7]: [--kv-channels KV_CHANNELS] +[default6]: [--ffn-hidden-size FFN_HIDDEN_SIZE] +[default5]: [--kv-channels KV_CHANNELS] +[default5]: [--max-position-embeddings MAX_POSITION_EMBEDDINGS] +[default5]: [--make-vocab-size-divisible-by MAKE_VOCAB_SIZE_DIVISIBLE_BY] +[default6]: [--num-attention-heads NUM_ATTENTION_HEADS] +[default7]: [--max-position-embeddings MAX_POSITION_EMBEDDINGS] +[default5]: [--pad-vocab-size-to PAD_VOCAB_SIZE_TO] +[default5]: [--layernorm-epsilon LAYERNORM_EPSILON] +[default5]: [--sync-tp-duplicated-parameters] +[default5]: [--apply-residual-connection-post-layernorm] +[default5]: [--embed-layernorm] [--openai-gelu] +[default7]: [--make-vocab-size-divisible-by MAKE_VOCAB_SIZE_DIVISIBLE_BY] +[default7]: [--pad-vocab-size-to PAD_VOCAB_SIZE_TO] +[default5]: [--onnx-safe ONNX_SAFE] [--bert-no-binary-head] +[default5]: [--position-embedding-type {PositionEmbeddingType.rotary,PositionEmbeddingType.absolute,PositionEmbeddingType.alibi}] +[default5]: [--glu-activation {geglu,liglu,reglu,swiglu}] +[default7]: [--layernorm-epsilon LAYERNORM_EPSILON] +[default7]: [--sync-tp-duplicated-parameters] +[default7]: [--apply-residual-connection-post-layernorm] +[default7]: [--embed-layernorm] [--openai-gelu] +[default7]: [--onnx-safe ONNX_SAFE] [--bert-no-binary-head] +[default7]: [--position-embedding-type {PositionEmbeddingType.rotary,PositionEmbeddingType.absolute,PositionEmbeddingType.alibi}] +[default7]: [--glu-activation {geglu,liglu,reglu,swiglu}] +[default6]: [--kv-channels KV_CHANNELS] +[default6]: [--max-position-embeddings MAX_POSITION_EMBEDDINGS] +[default6]: [--make-vocab-size-divisible-by MAKE_VOCAB_SIZE_DIVISIBLE_BY] +[default5]: [--kill-switch-path KILL_SWITCH_PATH] +[default5]: [--log-level {debug,info,warning,error,critical}] +[default5]: [--log-level-replica {debug,info,warning,error,critical}] +[default7]: [--kill-switch-path KILL_SWITCH_PATH] +[default5]: [--attention-dropout ATTENTION_DROPOUT] +[default7]: [--log-level {debug,info,warning,error,critical}] +[default5]: [--hidden-dropout HIDDEN_DROPOUT] +[default5]: [--weight-decay WEIGHT_DECAY] [--clip-grad CLIP_GRAD] +[default6]: [--pad-vocab-size-to PAD_VOCAB_SIZE_TO] +[default6]: [--layernorm-epsilon LAYERNORM_EPSILON] +[default5]: [--adam-beta1 ADAM_BETA1] [--adam-beta2 ADAM_BETA2] +[default5]: [--adam-eps ADAM_EPS] [--sgd-momentum SGD_MOMENTUM] +[default5]: [--micro-batch-size MICRO_BATCH_SIZE] +[default7]: [--log-level-replica {debug,info,warning,error,critical}] +[default6]: [--sync-tp-duplicated-parameters] +[default6]: [--apply-residual-connection-post-layernorm] +[default6]: [--embed-layernorm] [--openai-gelu] +[default6]: [--onnx-safe ONNX_SAFE] [--bert-no-binary-head] +[default7]: [--attention-dropout ATTENTION_DROPOUT] +[default7]: [--hidden-dropout HIDDEN_DROPOUT] +[default6]: [--position-embedding-type {PositionEmbeddingType.rotary,PositionEmbeddingType.absolute,PositionEmbeddingType.alibi}] +[default6]: [--glu-activation {geglu,liglu,reglu,swiglu}] +[default6]: [--kill-switch-path KILL_SWITCH_PATH] +[default7]: [--weight-decay WEIGHT_DECAY] [--clip-grad CLIP_GRAD] +[default7]: [--adam-beta1 ADAM_BETA1] [--adam-beta2 ADAM_BETA2] +[default6]: [--log-level {debug,info,warning,error,critical}] +[default6]: [--log-level-replica {debug,info,warning,error,critical}] +[default7]: [--adam-eps ADAM_EPS] [--sgd-momentum SGD_MOMENTUM] +[default7]: [--micro-batch-size MICRO_BATCH_SIZE] +[default7]: [--batch-size BATCH_SIZE] +[default7]: [--global-batch-size GLOBAL_BATCH_SIZE] +[default7]: [--rampup-batch-size [RAMPUP_BATCH_SIZE [RAMPUP_BATCH_SIZE ...]]] +[default5]: [--batch-size BATCH_SIZE] +[default5]: [--global-batch-size GLOBAL_BATCH_SIZE] +[default5]: [--rampup-batch-size [RAMPUP_BATCH_SIZE [RAMPUP_BATCH_SIZE ...]]] +[default7]: [--checkpoint-activations] +[default6]: [--attention-dropout ATTENTION_DROPOUT] +[default6]: [--hidden-dropout HIDDEN_DROPOUT] +[default6]: [--weight-decay WEIGHT_DECAY] [--clip-grad CLIP_GRAD] +[default7]: [--distribute-checkpointed-activations] +[default6]: [--adam-beta1 ADAM_BETA1] [--adam-beta2 ADAM_BETA2] +[default7]: [--checkpoint-num-layers CHECKPOINT_NUM_LAYERS] +[default7]: [--train-iters TRAIN_ITERS] +[default5]: [--checkpoint-activations] +[default7]: [--train-samples TRAIN_SAMPLES] +[default5]: [--distribute-checkpointed-activations] +[default5]: [--checkpoint-num-layers CHECKPOINT_NUM_LAYERS] +[default7]: [--train-tokens TRAIN_TOKENS] +[default6]: [--adam-eps ADAM_EPS] [--sgd-momentum SGD_MOMENTUM] +[default6]: [--micro-batch-size MICRO_BATCH_SIZE] +[default5]: [--train-iters TRAIN_ITERS] +[default5]: [--train-samples TRAIN_SAMPLES] +[default7]: [--log-interval LOG_INTERVAL] +[default7]: [--exit-interval EXIT_INTERVAL] +[default5]: [--train-tokens TRAIN_TOKENS] +[default5]: [--log-interval LOG_INTERVAL] +[default5]: [--exit-interval EXIT_INTERVAL] +[default7]: [--exit-duration-in-mins EXIT_DURATION_IN_MINS] +[default7]: [--tensorboard-dir TENSORBOARD_DIR] +[default5]: [--exit-duration-in-mins EXIT_DURATION_IN_MINS] +[default7]: [--no-masked-softmax-fusion] [--no-bias-gelu-fusion] +[default6]: [--batch-size BATCH_SIZE] +[default6]: [--global-batch-size GLOBAL_BATCH_SIZE] +[default7]: [--no-bias-dropout-fusion] [--optimizer {adam,sgd}] +[default7]: [--use-bnb-optimizer] +[default5]: [--tensorboard-dir TENSORBOARD_DIR] +[default6]: [--rampup-batch-size [RAMPUP_BATCH_SIZE [RAMPUP_BATCH_SIZE ...]]] +[default6]: [--checkpoint-activations] +[default5]: [--no-masked-softmax-fusion] [--no-bias-gelu-fusion] +[default5]: [--no-bias-dropout-fusion] [--optimizer {adam,sgd}] +[default5]: [--use-bnb-optimizer] +[default7]: [--dataloader-type {single,cyclic}] [--cpu-optimizer] +[default6]: [--distribute-checkpointed-activations] +[default6]: [--checkpoint-num-layers CHECKPOINT_NUM_LAYERS] +[default7]: [--cpu_torch_adam] [--codecarbon-dir CODECARBON_DIR] +[default7]: [--eval-only EVAL_ONLY] +[default5]: [--dataloader-type {single,cyclic}] [--cpu-optimizer] +[default5]: [--cpu_torch_adam] [--codecarbon-dir CODECARBON_DIR] +[default6]: [--train-iters TRAIN_ITERS] +[default6]: [--train-samples TRAIN_SAMPLES] +[default5]: [--eval-only EVAL_ONLY] +[default5]: [--skip-train-iteration-range SKIP_TRAIN_ITERATION_RANGE [SKIP_TRAIN_ITERATION_RANGE ...]] +[default5]: [--inference] +[default5]: [--abort-on-unmet-fused-kernel-constraints] +[default6]: [--train-tokens TRAIN_TOKENS] +[default6]: [--log-interval LOG_INTERVAL] +[default5]: [--pp-partition-method PP_PARTITION_METHOD] +[default5]: [--seed SEED] [--init-method-std INIT_METHOD_STD] +[default5]: [--init-method-xavier-uniform] [--lr LR] +[default5]: [--lr-decay-style {constant,linear,cosine}] +[default7]: [--skip-train-iteration-range SKIP_TRAIN_ITERATION_RANGE [SKIP_TRAIN_ITERATION_RANGE ...]] +[default6]: [--exit-interval EXIT_INTERVAL] +[default6]: [--exit-duration-in-mins EXIT_DURATION_IN_MINS] +[default6]: [--tensorboard-dir TENSORBOARD_DIR] +[default7]: [--inference] +[default7]: [--abort-on-unmet-fused-kernel-constraints] +[default7]: [--pp-partition-method PP_PARTITION_METHOD] +[default6]: [--no-masked-softmax-fusion] [--no-bias-gelu-fusion] +[default7]: [--seed SEED] [--init-method-std INIT_METHOD_STD] +[default7]: [--init-method-xavier-uniform] [--lr LR] +[default5]: [--lr-decay-iters LR_DECAY_ITERS] +[default7]: [--lr-decay-style {constant,linear,cosine}] +[default5]: [--lr-decay-samples LR_DECAY_SAMPLES] +[default5]: [--lr-decay-tokens LR_DECAY_TOKENS] +[default7]: [--lr-decay-iters LR_DECAY_ITERS] +[default6]: [--no-bias-dropout-fusion] [--optimizer {adam,sgd}] +[default6]: [--use-bnb-optimizer] +[default6]: [--dataloader-type {single,cyclic}] [--cpu-optimizer] +[default7]: [--lr-decay-samples LR_DECAY_SAMPLES] +[default7]: [--lr-decay-tokens LR_DECAY_TOKENS] +[default5]: [--lr-warmup-fraction LR_WARMUP_FRACTION] +[default7]: [--lr-warmup-fraction LR_WARMUP_FRACTION] +[default5]: [--lr-warmup-iters LR_WARMUP_ITERS] +[default7]: [--lr-warmup-iters LR_WARMUP_ITERS] +[default6]: [--cpu_torch_adam] [--codecarbon-dir CODECARBON_DIR] +[default6]: [--eval-only EVAL_ONLY] +[default7]: [--lr-warmup-samples LR_WARMUP_SAMPLES] +[default7]: [--warmup WARMUP] [--min-lr MIN_LR] +[default6]: [--skip-train-iteration-range SKIP_TRAIN_ITERATION_RANGE [SKIP_TRAIN_ITERATION_RANGE ...]] +[default6]: [--inference] +[default5]: [--lr-warmup-samples LR_WARMUP_SAMPLES] +[default5]: [--warmup WARMUP] [--min-lr MIN_LR] +[default6]: [--abort-on-unmet-fused-kernel-constraints] +[default6]: [--pp-partition-method PP_PARTITION_METHOD] +[default7]: [--override-lr-scheduler] +[default7]: [--use-checkpoint-lr-scheduler] [--save SAVE] +[default5]: [--override-lr-scheduler] +[default5]: [--use-checkpoint-lr-scheduler] [--save SAVE] +[default5]: [--save-interval SAVE_INTERVAL] [--no-save-optim] +[default7]: [--save-interval SAVE_INTERVAL] [--no-save-optim] +[default7]: [--no-save-rng] [--load LOAD] [--no-load-optim] +[default7]: [--no-load-rng] [--finetune] [--fp16] [--bf16] +[default7]: [--loss-scale LOSS_SCALE] +[default6]: [--seed SEED] [--init-method-std INIT_METHOD_STD] +[default6]: [--init-method-xavier-uniform] [--lr LR] +[default7]: [--initial-loss-scale INITIAL_LOSS_SCALE] +[default7]: [--min-loss-scale MIN_LOSS_SCALE] +[default5]: [--no-save-rng] [--load LOAD] [--no-load-optim] +[default7]: [--loss-scale-window LOSS_SCALE_WINDOW] +[default6]: [--lr-decay-style {constant,linear,cosine}] +[default6]: [--lr-decay-iters LR_DECAY_ITERS] +[default7]: [--hysteresis HYSTERESIS] [--fp32-residual-connection] +[default7]: [--no-query-key-layer-scaling] +[default6]: [--lr-decay-samples LR_DECAY_SAMPLES] +[default6]: [--lr-decay-tokens LR_DECAY_TOKENS] +[default7]: [--attention-softmax-in-fp32] +[default7]: [--accumulate-allreduce-grads-in-fp32] +[default6]: [--lr-warmup-fraction LR_WARMUP_FRACTION] +[default6]: [--lr-warmup-iters LR_WARMUP_ITERS] +[default7]: [--fp16-lm-cross-entropy] +[default7]: [--tensor-model-parallel-size TENSOR_MODEL_PARALLEL_SIZE] +[default6]: [--lr-warmup-samples LR_WARMUP_SAMPLES] +[default6]: [--warmup WARMUP] [--min-lr MIN_LR] +[default7]: [--pipeline-model-parallel-size PIPELINE_MODEL_PARALLEL_SIZE] +[default7]: [--model-parallel-size MODEL_PARALLEL_SIZE] +[default6]: [--override-lr-scheduler] +[default6]: [--use-checkpoint-lr-scheduler] [--save SAVE] +[default6]: [--save-interval SAVE_INTERVAL] [--no-save-optim] +[default6]: [--no-save-rng] [--load LOAD] [--no-load-optim] +[default6]: [--no-load-rng] [--finetune] [--fp16] [--bf16] +[default6]: [--loss-scale LOSS_SCALE] +[default5]: [--no-load-rng] [--finetune] [--fp16] [--bf16] +[default5]: [--loss-scale LOSS_SCALE] +[default5]: [--initial-loss-scale INITIAL_LOSS_SCALE] +[default7]: [--num-layers-per-virtual-pipeline-stage NUM_LAYERS_PER_VIRTUAL_PIPELINE_STAGE] +[default5]: [--min-loss-scale MIN_LOSS_SCALE] +[default5]: [--loss-scale-window LOSS_SCALE_WINDOW] +[default5]: [--hysteresis HYSTERESIS] [--fp32-residual-connection] +[default7]: [--distributed-backend {nccl,gloo}] +[default6]: [--initial-loss-scale INITIAL_LOSS_SCALE] +[default6]: [--min-loss-scale MIN_LOSS_SCALE] +[default7]: [--DDP-impl {local,torch}] +[default7]: [--use-contiguous-buffers-in-ddp] +[default5]: [--no-query-key-layer-scaling] +[default5]: [--attention-softmax-in-fp32] +[default7]: [--no-scatter-gather-tensors-in-pipeline] +[default5]: [--accumulate-allreduce-grads-in-fp32] +[default7]: [--local_rank LOCAL_RANK] +[default5]: [--fp16-lm-cross-entropy] +[default5]: [--tensor-model-parallel-size TENSOR_MODEL_PARALLEL_SIZE] +[default7]: [--lazy-mpu-init LAZY_MPU_INIT] +[default7]: [--use-cpu-initialization] [--eval-iters EVAL_ITERS] +[default7]: [--eval-interval EVAL_INTERVAL] +[default7]: [--data-path [DATA_PATH [DATA_PATH ...]]] +[default5]: [--pipeline-model-parallel-size PIPELINE_MODEL_PARALLEL_SIZE] +[default6]: [--loss-scale-window LOSS_SCALE_WINDOW] +[default6]: [--hysteresis HYSTERESIS] [--fp32-residual-connection] +[default6]: [--no-query-key-layer-scaling] +[default5]: [--model-parallel-size MODEL_PARALLEL_SIZE] +[default5]: [--num-layers-per-virtual-pipeline-stage NUM_LAYERS_PER_VIRTUAL_PIPELINE_STAGE] +[default7]: [--split SPLIT] +[default6]: [--attention-softmax-in-fp32] +[default6]: [--accumulate-allreduce-grads-in-fp32] +[default7]: [--train-weighted-split-paths [TRAIN_WEIGHTED_SPLIT_PATHS [TRAIN_WEIGHTED_SPLIT_PATHS ...]]] +[default7]: [--valid-weighted-split-paths [VALID_WEIGHTED_SPLIT_PATHS [VALID_WEIGHTED_SPLIT_PATHS ...]]] +[default6]: [--fp16-lm-cross-entropy] +[default6]: [--tensor-model-parallel-size TENSOR_MODEL_PARALLEL_SIZE] +[default5]: [--distributed-backend {nccl,gloo}] +[default5]: [--DDP-impl {local,torch}] +[default5]: [--use-contiguous-buffers-in-ddp] +[default7]: [--test-weighted-split-paths [TEST_WEIGHTED_SPLIT_PATHS [TEST_WEIGHTED_SPLIT_PATHS ...]]] +[default6]: [--pipeline-model-parallel-size PIPELINE_MODEL_PARALLEL_SIZE] +[default6]: [--model-parallel-size MODEL_PARALLEL_SIZE] +[default7]: [--train-weighted-split-paths-path TRAIN_WEIGHTED_SPLIT_PATHS_PATH] +[default5]: [--no-scatter-gather-tensors-in-pipeline] +[default5]: [--local_rank LOCAL_RANK] +[default5]: [--lazy-mpu-init LAZY_MPU_INIT] +[default6]: [--num-layers-per-virtual-pipeline-stage NUM_LAYERS_PER_VIRTUAL_PIPELINE_STAGE] +[default6]: [--distributed-backend {nccl,gloo}] +[default7]: [--valid-weighted-split-paths-path VALID_WEIGHTED_SPLIT_PATHS_PATH] +[default7]: [--test-weighted-split-paths-path TEST_WEIGHTED_SPLIT_PATHS_PATH] +[default7]: [--log-path LOG_PATH] [--vocab-file VOCAB_FILE] +[default7]: [--merge-file MERGE_FILE] +[default5]: [--use-cpu-initialization] [--eval-iters EVAL_ITERS] +[default5]: [--eval-interval EVAL_INTERVAL] +[default5]: [--data-path [DATA_PATH [DATA_PATH ...]]] +[default6]: [--DDP-impl {local,torch}] +[default6]: [--use-contiguous-buffers-in-ddp] +[default6]: [--no-scatter-gather-tensors-in-pipeline] +[default5]: [--split SPLIT] +[default5]: [--train-weighted-split-paths [TRAIN_WEIGHTED_SPLIT_PATHS [TRAIN_WEIGHTED_SPLIT_PATHS ...]]] +[default7]: [--vocab-extra-ids VOCAB_EXTRA_IDS] +[default6]: [--local_rank LOCAL_RANK] +[default6]: [--lazy-mpu-init LAZY_MPU_INIT] +[default7]: [--seq-length SEQ_LENGTH] +[default7]: [--encoder-seq-length ENCODER_SEQ_LENGTH] +[default7]: [--decoder-seq-length DECODER_SEQ_LENGTH] +[default7]: [--retriever-seq-length RETRIEVER_SEQ_LENGTH] +[default7]: [--sample-rate SAMPLE_RATE] [--mask-prob MASK_PROB] +[default5]: [--valid-weighted-split-paths [VALID_WEIGHTED_SPLIT_PATHS [VALID_WEIGHTED_SPLIT_PATHS ...]]] +[default5]: [--test-weighted-split-paths [TEST_WEIGHTED_SPLIT_PATHS [TEST_WEIGHTED_SPLIT_PATHS ...]]] +[default7]: [--short-seq-prob SHORT_SEQ_PROB] [--mmap-warmup] +[default7]: [--num-workers NUM_WORKERS] +[default7]: [--valid-num-workers VALID_NUM_WORKERS] +[default7]: [--tokenizer-type {BertWordPieceLowerCase,BertWordPieceCase,GPT2BPETokenizer,PretrainedFromHF}] +[default7]: [--tokenizer-name-or-path TOKENIZER_NAME_OR_PATH] +[default6]: [--use-cpu-initialization] [--eval-iters EVAL_ITERS] +[default6]: [--eval-interval EVAL_INTERVAL] +[default6]: [--data-path [DATA_PATH [DATA_PATH ...]]] +[default5]: [--train-weighted-split-paths-path TRAIN_WEIGHTED_SPLIT_PATHS_PATH] +[default5]: [--valid-weighted-split-paths-path VALID_WEIGHTED_SPLIT_PATHS_PATH] +[default5]: [--test-weighted-split-paths-path TEST_WEIGHTED_SPLIT_PATHS_PATH] +[default6]: [--split SPLIT] +[default6]: [--train-weighted-split-paths [TRAIN_WEIGHTED_SPLIT_PATHS [TRAIN_WEIGHTED_SPLIT_PATHS ...]]] +[default7]: [--data-impl {lazy,cached,mmap,infer}] +[default7]: [--reset-position-ids] [--reset-attention-mask] +[default6]: [--valid-weighted-split-paths [VALID_WEIGHTED_SPLIT_PATHS [VALID_WEIGHTED_SPLIT_PATHS ...]]] +[default6]: [--test-weighted-split-paths [TEST_WEIGHTED_SPLIT_PATHS [TEST_WEIGHTED_SPLIT_PATHS ...]]] +[default7]: [--eod-mask-loss] [--loss-on-targets-only] +[default7]: [--norm-target-loss] +[default6]: [--train-weighted-split-paths-path TRAIN_WEIGHTED_SPLIT_PATHS_PATH] +[default6]: [--valid-weighted-split-paths-path VALID_WEIGHTED_SPLIT_PATHS_PATH] +[default5]: [--log-path LOG_PATH] [--vocab-file VOCAB_FILE] +[default5]: [--merge-file MERGE_FILE] +[default7]: [--reweight-loss-based-on-position-frequency] +[default6]: [--test-weighted-split-paths-path TEST_WEIGHTED_SPLIT_PATHS_PATH] +[default6]: [--log-path LOG_PATH] [--vocab-file VOCAB_FILE] +[default7]: [--noise-density NOISE_DENSITY] +[default7]: [--mean-noise-span-length MEAN_NOISE_SPAN_LENGTH] +[default7]: [--prefixlm] [--adlr-autoresume] +[default5]: [--vocab-extra-ids VOCAB_EXTRA_IDS] +[default5]: [--seq-length SEQ_LENGTH] +[default5]: [--encoder-seq-length ENCODER_SEQ_LENGTH] +[default7]: [--adlr-autoresume-interval ADLR_AUTORESUME_INTERVAL] +[default6]: [--merge-file MERGE_FILE] +[default6]: [--vocab-extra-ids VOCAB_EXTRA_IDS] +[default5]: [--decoder-seq-length DECODER_SEQ_LENGTH] +[default5]: [--retriever-seq-length RETRIEVER_SEQ_LENGTH] +[default6]: [--seq-length SEQ_LENGTH] +[default6]: [--encoder-seq-length ENCODER_SEQ_LENGTH] +[default5]: [--sample-rate SAMPLE_RATE] [--mask-prob MASK_PROB] +[default5]: [--short-seq-prob SHORT_SEQ_PROB] [--mmap-warmup] +[default6]: [--decoder-seq-length DECODER_SEQ_LENGTH] +[default6]: [--retriever-seq-length RETRIEVER_SEQ_LENGTH] +[default6]: [--sample-rate SAMPLE_RATE] [--mask-prob MASK_PROB] +[default6]: [--short-seq-prob SHORT_SEQ_PROB] [--mmap-warmup] +[default7]: [--ict-head-size ICT_HEAD_SIZE] +[default7]: [--biencoder-projection-dim BIENCODER_PROJECTION_DIM] +[default7]: [--biencoder-shared-query-context-model] +[default7]: [--ict-load ICT_LOAD] [--bert-load BERT_LOAD] +[default7]: [--titles-data-path TITLES_DATA_PATH] +[default7]: [--query-in-block-prob QUERY_IN_BLOCK_PROB] +[default6]: [--num-workers NUM_WORKERS] +[default6]: [--valid-num-workers VALID_NUM_WORKERS] +[default7]: [--use-one-sent-docs] +[default7]: [--evidence-data-path EVIDENCE_DATA_PATH] +[default6]: [--tokenizer-type {BertWordPieceLowerCase,BertWordPieceCase,GPT2BPETokenizer,PretrainedFromHF}] +[default6]: [--tokenizer-name-or-path TOKENIZER_NAME_OR_PATH] +[default5]: [--num-workers NUM_WORKERS] +[default5]: [--valid-num-workers VALID_NUM_WORKERS] +[default7]: [--retriever-report-topk-accuracies RETRIEVER_REPORT_TOPK_ACCURACIES [RETRIEVER_REPORT_TOPK_ACCURACIES ...]] +[default6]: [--data-impl {lazy,cached,mmap,infer}] +[default6]: [--reset-position-ids] [--reset-attention-mask] +[default7]: [--retriever-score-scaling] +[default5]: [--tokenizer-type {BertWordPieceLowerCase,BertWordPieceCase,GPT2BPETokenizer,PretrainedFromHF}] +[default5]: [--tokenizer-name-or-path TOKENIZER_NAME_OR_PATH] +[default5]: [--data-impl {lazy,cached,mmap,infer}] +[default5]: [--reset-position-ids] [--reset-attention-mask] +[default6]: [--eod-mask-loss] [--loss-on-targets-only] +[default6]: [--norm-target-loss] +[default5]: [--eod-mask-loss] [--loss-on-targets-only] +[default5]: [--norm-target-loss] +[default7]: [--block-data-path BLOCK_DATA_PATH] +[default7]: [--embedding-path EMBEDDING_PATH] +[default5]: [--reweight-loss-based-on-position-frequency] +[default7]: [--indexer-batch-size INDEXER_BATCH_SIZE] +[default5]: [--noise-density NOISE_DENSITY] +[default7]: [--indexer-log-interval INDEXER_LOG_INTERVAL] +[default7]: [--num-classes NUM_CLASSES] [--img-dim IMG_DIM] +[default7]: [--num-channels NUM_CHANNELS] [--patch-dim PATCH_DIM] +[default7]: [--log-params-norm] [--log-num-zeros-in-grad] +[default7]: [--tensorboard-log-interval TENSORBOARD_LOG_INTERVAL] +[default5]: [--mean-noise-span-length MEAN_NOISE_SPAN_LENGTH] +[default7]: [--tensorboard-queue-size TENSORBOARD_QUEUE_SIZE] +[default5]: [--prefixlm] [--adlr-autoresume] +[default5]: [--adlr-autoresume-interval ADLR_AUTORESUME_INTERVAL] +[default6]: [--reweight-loss-based-on-position-frequency] +[default6]: [--noise-density NOISE_DENSITY] +[default7]: [--log-timers-to-tensorboard] +[default7]: [--log-batch-size-to-tensorboard] +[default6]: [--mean-noise-span-length MEAN_NOISE_SPAN_LENGTH] +[default6]: [--prefixlm] [--adlr-autoresume] +[default5]: [--ict-head-size ICT_HEAD_SIZE] +[default5]: [--biencoder-projection-dim BIENCODER_PROJECTION_DIM] +[default5]: [--biencoder-shared-query-context-model] +[default6]: [--adlr-autoresume-interval ADLR_AUTORESUME_INTERVAL] +[default6]: [--ict-head-size ICT_HEAD_SIZE] +[default5]: [--ict-load ICT_LOAD] [--bert-load BERT_LOAD] +[default5]: [--titles-data-path TITLES_DATA_PATH] +[default6]: [--biencoder-projection-dim BIENCODER_PROJECTION_DIM] +[default6]: [--biencoder-shared-query-context-model] +[default5]: [--query-in-block-prob QUERY_IN_BLOCK_PROB] +[default5]: [--use-one-sent-docs] +[default5]: [--evidence-data-path EVIDENCE_DATA_PATH] +[default7]: [--no-log-learnig-rate-to-tensorboard] +[default6]: [--ict-load ICT_LOAD] [--bert-load BERT_LOAD] +[default6]: [--titles-data-path TITLES_DATA_PATH] +[default7]: [--no-log-loss-scale-to-tensorboard] +[default7]: [--log-validation-ppl-to-tensorboard] +[default5]: [--retriever-report-topk-accuracies RETRIEVER_REPORT_TOPK_ACCURACIES [RETRIEVER_REPORT_TOPK_ACCURACIES ...]] +[default5]: [--retriever-score-scaling] +[default7]: [--zero-stage ZERO_STAGE] [--zero-reduce-scatter] +[default5]: [--block-data-path BLOCK_DATA_PATH] +[default5]: [--embedding-path EMBEDDING_PATH] +[default5]: [--indexer-batch-size INDEXER_BATCH_SIZE] +[default7]: [--zero-contigious-gradients] +[default7]: [--zero-reduce-bucket-size ZERO_REDUCE_BUCKET_SIZE] +[default5]: [--indexer-log-interval INDEXER_LOG_INTERVAL] +[default7]: [--zero-allgather-bucket-size ZERO_ALLGATHER_BUCKET_SIZE] +[default6]: [--query-in-block-prob QUERY_IN_BLOCK_PROB] +[default6]: [--use-one-sent-docs] +[default6]: [--evidence-data-path EVIDENCE_DATA_PATH] +[default6]: [--retriever-report-topk-accuracies RETRIEVER_REPORT_TOPK_ACCURACIES [RETRIEVER_REPORT_TOPK_ACCURACIES ...]] +[default7]: [--remote-device {none,cpu,nvme}] [--use-pin-memory] +[default7]: [--scattered-embeddings] [--split-transformers] +[default7]: [--memory-centric-tiled-linear] +[default7]: [--tile-factor TILE_FACTOR] +[default6]: [--retriever-score-scaling] +[default5]: [--num-classes NUM_CLASSES] [--img-dim IMG_DIM] +[default7]: [--deepspeed-activation-checkpointing] +[default7]: [--partition-activations] [--contigious-checkpointing] +[default5]: [--num-channels NUM_CHANNELS] [--patch-dim PATCH_DIM] +[default7]: [--checkpoint-in-cpu] [--synchronize-each-layer] +[default6]: [--block-data-path BLOCK_DATA_PATH] +[default7]: [--profile-backward] [--deepspeed] +[default7]: [--deepspeed_config DEEPSPEED_CONFIG] [--deepscale] +[default7]: [--deepscale_config DEEPSCALE_CONFIG] [--deepspeed_mpi] +[default5]: [--log-params-norm] [--log-num-zeros-in-grad] +[default5]: [--tensorboard-log-interval TENSORBOARD_LOG_INTERVAL] +[default6]: [--embedding-path EMBEDDING_PATH] +[default6]: [--indexer-batch-size INDEXER_BATCH_SIZE] +[default6]: [--indexer-log-interval INDEXER_LOG_INTERVAL] +[default7]:finetune_t0.py: error: unrecognized arguments: --reset-progress +[default6]: [--num-classes NUM_CLASSES] [--img-dim IMG_DIM] +[default6]: [--num-channels NUM_CHANNELS] [--patch-dim PATCH_DIM] +[default6]: [--log-params-norm] [--log-num-zeros-in-grad] +[default6]: [--tensorboard-log-interval TENSORBOARD_LOG_INTERVAL] +[default5]: [--tensorboard-queue-size TENSORBOARD_QUEUE_SIZE] +[default5]: [--log-timers-to-tensorboard] +[default6]: [--tensorboard-queue-size TENSORBOARD_QUEUE_SIZE] +[default6]: [--log-timers-to-tensorboard] +[default5]: [--log-batch-size-to-tensorboard] +[default5]: [--no-log-learnig-rate-to-tensorboard] +[default5]: [--no-log-loss-scale-to-tensorboard] +[default6]: [--log-batch-size-to-tensorboard] +[default6]: [--no-log-learnig-rate-to-tensorboard] +[default6]: [--no-log-loss-scale-to-tensorboard] +[default5]: [--log-validation-ppl-to-tensorboard] +[default5]: [--zero-stage ZERO_STAGE] [--zero-reduce-scatter] +[default6]: [--log-validation-ppl-to-tensorboard] +[default6]: [--zero-stage ZERO_STAGE] [--zero-reduce-scatter] +[default6]: [--zero-contigious-gradients] +[default5]: [--zero-contigious-gradients] +[default5]: [--zero-reduce-bucket-size ZERO_REDUCE_BUCKET_SIZE] +[default5]: [--zero-allgather-bucket-size ZERO_ALLGATHER_BUCKET_SIZE] +[default5]: [--remote-device {none,cpu,nvme}] [--use-pin-memory] +[default5]: [--scattered-embeddings] [--split-transformers] +[default6]: [--zero-reduce-bucket-size ZERO_REDUCE_BUCKET_SIZE] +[default6]: [--zero-allgather-bucket-size ZERO_ALLGATHER_BUCKET_SIZE] +[default5]: [--memory-centric-tiled-linear] +[default6]: [--remote-device {none,cpu,nvme}] [--use-pin-memory] +[default6]: [--scattered-embeddings] [--split-transformers] +[default6]: [--memory-centric-tiled-linear] +[default5]: [--tile-factor TILE_FACTOR] +[default5]: [--deepspeed-activation-checkpointing] +[default5]: [--partition-activations] [--contigious-checkpointing] +[default6]: [--tile-factor TILE_FACTOR] +[default6]: [--deepspeed-activation-checkpointing] +[default6]: [--partition-activations] [--contigious-checkpointing] +[default5]: [--checkpoint-in-cpu] [--synchronize-each-layer] +[default5]: [--profile-backward] [--deepspeed] +[default6]: [--checkpoint-in-cpu] [--synchronize-each-layer] +[default6]: [--profile-backward] [--deepspeed] +[default6]: [--deepspeed_config DEEPSPEED_CONFIG] [--deepscale] +[default5]: [--deepspeed_config DEEPSPEED_CONFIG] [--deepscale] +[default5]: [--deepscale_config DEEPSCALE_CONFIG] [--deepspeed_mpi] +[default5]:finetune_t0.py: error: unrecognized arguments: --reset-progress +[default6]: [--deepscale_config DEEPSCALE_CONFIG] [--deepspeed_mpi] +[default6]:finetune_t0.py: error: unrecognized arguments: --reset-progress +[default3]:usage: finetune_t0.py [-h] [--num-layers NUM_LAYERS] +[default1]:usage: finetune_t0.py [-h] [--num-layers NUM_LAYERS] +[default1]: [--hidden-size HIDDEN_SIZE] +[default3]: [--hidden-size HIDDEN_SIZE] +[default3]: [--ffn-hidden-size FFN_HIDDEN_SIZE] +[default1]: [--ffn-hidden-size FFN_HIDDEN_SIZE] +[default1]: [--num-attention-heads NUM_ATTENTION_HEADS] +[default3]: [--num-attention-heads NUM_ATTENTION_HEADS] +[default3]: [--kv-channels KV_CHANNELS] +[default1]: [--kv-channels KV_CHANNELS] +[default1]: [--max-position-embeddings MAX_POSITION_EMBEDDINGS] +[default3]: [--max-position-embeddings MAX_POSITION_EMBEDDINGS] +[default3]: [--make-vocab-size-divisible-by MAKE_VOCAB_SIZE_DIVISIBLE_BY] +[default3]: [--pad-vocab-size-to PAD_VOCAB_SIZE_TO] +[default1]: [--make-vocab-size-divisible-by MAKE_VOCAB_SIZE_DIVISIBLE_BY] +[default1]: [--pad-vocab-size-to PAD_VOCAB_SIZE_TO] +[default1]: [--layernorm-epsilon LAYERNORM_EPSILON] +[default3]: [--layernorm-epsilon LAYERNORM_EPSILON] +[default3]: [--sync-tp-duplicated-parameters] +[default1]: [--sync-tp-duplicated-parameters] +[default1]: [--apply-residual-connection-post-layernorm] +[default1]: [--embed-layernorm] [--openai-gelu] +[default3]: [--apply-residual-connection-post-layernorm] +[default3]: [--embed-layernorm] [--openai-gelu] +[default3]: [--onnx-safe ONNX_SAFE] [--bert-no-binary-head] +[default3]: [--position-embedding-type {PositionEmbeddingType.rotary,PositionEmbeddingType.absolute,PositionEmbeddingType.alibi}] +[default3]: [--glu-activation {geglu,liglu,reglu,swiglu}] +[default1]: [--onnx-safe ONNX_SAFE] [--bert-no-binary-head] +[default1]: [--position-embedding-type {PositionEmbeddingType.rotary,PositionEmbeddingType.absolute,PositionEmbeddingType.alibi}] +[default3]: [--kill-switch-path KILL_SWITCH_PATH] +[default3]: [--log-level {debug,info,warning,error,critical}] +[default1]: [--glu-activation {geglu,liglu,reglu,swiglu}] +[default1]: [--kill-switch-path KILL_SWITCH_PATH] +[default3]: [--log-level-replica {debug,info,warning,error,critical}] +[default3]: [--attention-dropout ATTENTION_DROPOUT] +[default3]: [--hidden-dropout HIDDEN_DROPOUT] +[default1]: [--log-level {debug,info,warning,error,critical}] +[default1]: [--log-level-replica {debug,info,warning,error,critical}] +[default1]: [--attention-dropout ATTENTION_DROPOUT] +[default3]: [--weight-decay WEIGHT_DECAY] [--clip-grad CLIP_GRAD] +[default3]: [--adam-beta1 ADAM_BETA1] [--adam-beta2 ADAM_BETA2] +[default3]: [--adam-eps ADAM_EPS] [--sgd-momentum SGD_MOMENTUM] +[default1]: [--hidden-dropout HIDDEN_DROPOUT] +[default1]: [--weight-decay WEIGHT_DECAY] [--clip-grad CLIP_GRAD] +[default1]: [--adam-beta1 ADAM_BETA1] [--adam-beta2 ADAM_BETA2] +[default3]: [--micro-batch-size MICRO_BATCH_SIZE] +[default3]: [--batch-size BATCH_SIZE] +[default3]: [--global-batch-size GLOBAL_BATCH_SIZE] +[default1]: [--adam-eps ADAM_EPS] [--sgd-momentum SGD_MOMENTUM] +[default1]: [--micro-batch-size MICRO_BATCH_SIZE] +[default3]: [--rampup-batch-size [RAMPUP_BATCH_SIZE [RAMPUP_BATCH_SIZE ...]]] +[default3]: [--checkpoint-activations] +[default3]: [--distribute-checkpointed-activations] +[default1]: [--batch-size BATCH_SIZE] +[default1]: [--global-batch-size GLOBAL_BATCH_SIZE] +[default3]: [--checkpoint-num-layers CHECKPOINT_NUM_LAYERS] +[default3]: [--train-iters TRAIN_ITERS] +[default3]: [--train-samples TRAIN_SAMPLES] +[default3]: [--train-tokens TRAIN_TOKENS] +[default1]: [--rampup-batch-size [RAMPUP_BATCH_SIZE [RAMPUP_BATCH_SIZE ...]]] +[default1]: [--checkpoint-activations] +[default1]: [--distribute-checkpointed-activations] +[default1]: [--checkpoint-num-layers CHECKPOINT_NUM_LAYERS] +[default3]: [--log-interval LOG_INTERVAL] +[default3]: [--exit-interval EXIT_INTERVAL] +[default1]: [--train-iters TRAIN_ITERS] +[default1]: [--train-samples TRAIN_SAMPLES] +[default3]: [--exit-duration-in-mins EXIT_DURATION_IN_MINS] +[default3]: [--tensorboard-dir TENSORBOARD_DIR] +[default1]: [--train-tokens TRAIN_TOKENS] +[default1]: [--log-interval LOG_INTERVAL] +[default1]: [--exit-interval EXIT_INTERVAL] +[default1]: [--exit-duration-in-mins EXIT_DURATION_IN_MINS] +[default3]: [--no-masked-softmax-fusion] [--no-bias-gelu-fusion] +[default3]: [--no-bias-dropout-fusion] [--optimizer {adam,sgd}] +[default1]: [--tensorboard-dir TENSORBOARD_DIR] +[default1]: [--no-masked-softmax-fusion] [--no-bias-gelu-fusion] +[default1]: [--no-bias-dropout-fusion] [--optimizer {adam,sgd}] +[default1]: [--use-bnb-optimizer] +[default1]: [--dataloader-type {single,cyclic}] [--cpu-optimizer] +[default3]: [--use-bnb-optimizer] +[default3]: [--dataloader-type {single,cyclic}] [--cpu-optimizer] +[default3]: [--cpu_torch_adam] [--codecarbon-dir CODECARBON_DIR] +[default1]: [--cpu_torch_adam] [--codecarbon-dir CODECARBON_DIR] +[default3]: [--eval-only EVAL_ONLY] +[default1]: [--eval-only EVAL_ONLY] +[default2]:usage: finetune_t0.py [-h] [--num-layers NUM_LAYERS] +[default2]: [--hidden-size HIDDEN_SIZE] +[default2]: [--ffn-hidden-size FFN_HIDDEN_SIZE] +[default3]: [--skip-train-iteration-range SKIP_TRAIN_ITERATION_RANGE [SKIP_TRAIN_ITERATION_RANGE ...]] +[default3]: [--inference] +[default3]: [--abort-on-unmet-fused-kernel-constraints] +[default1]: [--skip-train-iteration-range SKIP_TRAIN_ITERATION_RANGE [SKIP_TRAIN_ITERATION_RANGE ...]] +[default1]: [--inference] +[default1]: [--abort-on-unmet-fused-kernel-constraints] +[default1]: [--pp-partition-method PP_PARTITION_METHOD] +[default1]: [--seed SEED] [--init-method-std INIT_METHOD_STD] +[default4]:usage: finetune_t0.py [-h] [--num-layers NUM_LAYERS] +[default4]: [--hidden-size HIDDEN_SIZE] +[default4]: [--ffn-hidden-size FFN_HIDDEN_SIZE] +[default4]: [--num-attention-heads NUM_ATTENTION_HEADS] +[default4]: [--kv-channels KV_CHANNELS] +[default4]: [--max-position-embeddings MAX_POSITION_EMBEDDINGS] +[default4]: [--make-vocab-size-divisible-by MAKE_VOCAB_SIZE_DIVISIBLE_BY] +[default4]: [--pad-vocab-size-to PAD_VOCAB_SIZE_TO] +[default4]: [--layernorm-epsilon LAYERNORM_EPSILON] +[default3]: [--pp-partition-method PP_PARTITION_METHOD] +[default3]: [--seed SEED] [--init-method-std INIT_METHOD_STD] +[default3]: [--init-method-xavier-uniform] [--lr LR] +[default3]: [--lr-decay-style {constant,linear,cosine}] +[default0]:usage: finetune_t0.py [-h] [--num-layers NUM_LAYERS] +[default3]: [--lr-decay-iters LR_DECAY_ITERS] +[default3]: [--lr-decay-samples LR_DECAY_SAMPLES] +[default3]: [--lr-decay-tokens LR_DECAY_TOKENS] +[default3]: [--lr-warmup-fraction LR_WARMUP_FRACTION] +[default2]: [--num-attention-heads NUM_ATTENTION_HEADS] +[default0]: [--hidden-size HIDDEN_SIZE] +[default0]: [--ffn-hidden-size FFN_HIDDEN_SIZE] +[default2]: [--kv-channels KV_CHANNELS] +[default3]: [--lr-warmup-iters LR_WARMUP_ITERS] +[default3]: [--lr-warmup-samples LR_WARMUP_SAMPLES] +[default3]: [--warmup WARMUP] [--min-lr MIN_LR] +[default2]: [--max-position-embeddings MAX_POSITION_EMBEDDINGS] +[default2]: [--make-vocab-size-divisible-by MAKE_VOCAB_SIZE_DIVISIBLE_BY] +[default1]: [--init-method-xavier-uniform] [--lr LR] +[default0]: [--num-attention-heads NUM_ATTENTION_HEADS] +[default3]: [--override-lr-scheduler] +[default3]: [--use-checkpoint-lr-scheduler] [--save SAVE] +[default3]: [--save-interval SAVE_INTERVAL] [--no-save-optim] +[default0]: [--kv-channels KV_CHANNELS] +[default0]: [--max-position-embeddings MAX_POSITION_EMBEDDINGS] +[default4]: [--sync-tp-duplicated-parameters] +[default1]: [--lr-decay-style {constant,linear,cosine}] +[default1]: [--lr-decay-iters LR_DECAY_ITERS] +[default4]: [--apply-residual-connection-post-layernorm] +[default4]: [--embed-layernorm] [--openai-gelu] +[default1]: [--lr-decay-samples LR_DECAY_SAMPLES] +[default1]: [--lr-decay-tokens LR_DECAY_TOKENS] +[default4]: [--onnx-safe ONNX_SAFE] [--bert-no-binary-head] +[default4]: [--position-embedding-type {PositionEmbeddingType.rotary,PositionEmbeddingType.absolute,PositionEmbeddingType.alibi}] +[default4]: [--glu-activation {geglu,liglu,reglu,swiglu}] +[default3]: [--no-save-rng] [--load LOAD] [--no-load-optim] +[default3]: [--no-load-rng] [--finetune] [--fp16] [--bf16] +[default3]: [--loss-scale LOSS_SCALE] +[default3]: [--initial-loss-scale INITIAL_LOSS_SCALE] +[default1]: [--lr-warmup-fraction LR_WARMUP_FRACTION] +[default1]: [--lr-warmup-iters LR_WARMUP_ITERS] +[default2]: [--pad-vocab-size-to PAD_VOCAB_SIZE_TO] +[default2]: [--layernorm-epsilon LAYERNORM_EPSILON] +[default2]: [--sync-tp-duplicated-parameters] +[default2]: [--apply-residual-connection-post-layernorm] +[default2]: [--embed-layernorm] [--openai-gelu] +[default3]: [--min-loss-scale MIN_LOSS_SCALE] +[default3]: [--loss-scale-window LOSS_SCALE_WINDOW] +[default4]: [--kill-switch-path KILL_SWITCH_PATH] +[default4]: [--log-level {debug,info,warning,error,critical}] +[default4]: [--log-level-replica {debug,info,warning,error,critical}] +[default2]: [--onnx-safe ONNX_SAFE] [--bert-no-binary-head] +[default2]: [--position-embedding-type {PositionEmbeddingType.rotary,PositionEmbeddingType.absolute,PositionEmbeddingType.alibi}] +[default2]: [--glu-activation {geglu,liglu,reglu,swiglu}] +[default3]: [--hysteresis HYSTERESIS] [--fp32-residual-connection] +[default3]: [--no-query-key-layer-scaling] +[default3]: [--attention-softmax-in-fp32] +[default3]: [--accumulate-allreduce-grads-in-fp32] +[default3]: [--fp16-lm-cross-entropy] +[default1]: [--lr-warmup-samples LR_WARMUP_SAMPLES] +[default1]: [--warmup WARMUP] [--min-lr MIN_LR] +[default0]: [--make-vocab-size-divisible-by MAKE_VOCAB_SIZE_DIVISIBLE_BY] +[default0]: [--pad-vocab-size-to PAD_VOCAB_SIZE_TO] +[default0]: [--layernorm-epsilon LAYERNORM_EPSILON] +[default0]: [--sync-tp-duplicated-parameters] +[default2]: [--kill-switch-path KILL_SWITCH_PATH] +[default2]: [--log-level {debug,info,warning,error,critical}] +[default2]: [--log-level-replica {debug,info,warning,error,critical}] +[default2]: [--attention-dropout ATTENTION_DROPOUT] +[default2]: [--hidden-dropout HIDDEN_DROPOUT] +[default0]: [--apply-residual-connection-post-layernorm] +[default3]: [--tensor-model-parallel-size TENSOR_MODEL_PARALLEL_SIZE] +[default0]: [--embed-layernorm] [--openai-gelu] +[default4]: [--attention-dropout ATTENTION_DROPOUT] +[default2]: [--weight-decay WEIGHT_DECAY] [--clip-grad CLIP_GRAD] +[default2]: [--adam-beta1 ADAM_BETA1] [--adam-beta2 ADAM_BETA2] +[default2]: [--adam-eps ADAM_EPS] [--sgd-momentum SGD_MOMENTUM] +[default2]: [--micro-batch-size MICRO_BATCH_SIZE] +[default3]: [--pipeline-model-parallel-size PIPELINE_MODEL_PARALLEL_SIZE] +[default4]: [--hidden-dropout HIDDEN_DROPOUT] +[default2]: [--batch-size BATCH_SIZE] +[default1]: [--override-lr-scheduler] +[default1]: [--use-checkpoint-lr-scheduler] [--save SAVE] +[default1]: [--save-interval SAVE_INTERVAL] [--no-save-optim] +[default2]: [--global-batch-size GLOBAL_BATCH_SIZE] +[default2]: [--rampup-batch-size [RAMPUP_BATCH_SIZE [RAMPUP_BATCH_SIZE ...]]] +[default2]: [--checkpoint-activations] +[default2]: [--distribute-checkpointed-activations] +[default0]: [--onnx-safe ONNX_SAFE] [--bert-no-binary-head] +[default1]: [--no-save-rng] [--load LOAD] [--no-load-optim] +[default2]: [--checkpoint-num-layers CHECKPOINT_NUM_LAYERS] +[default2]: [--train-iters TRAIN_ITERS] +[default2]: [--train-samples TRAIN_SAMPLES] +[default1]: [--no-load-rng] [--finetune] [--fp16] [--bf16] +[default0]: [--position-embedding-type {PositionEmbeddingType.rotary,PositionEmbeddingType.absolute,PositionEmbeddingType.alibi}] +[default2]: [--train-tokens TRAIN_TOKENS] +[default2]: [--log-interval LOG_INTERVAL] +[default3]: [--model-parallel-size MODEL_PARALLEL_SIZE] +[default3]: [--num-layers-per-virtual-pipeline-stage NUM_LAYERS_PER_VIRTUAL_PIPELINE_STAGE] +[default3]: [--distributed-backend {nccl,gloo}] +[default3]: [--DDP-impl {local,torch}] +[default2]: [--exit-interval EXIT_INTERVAL] +[default0]: [--glu-activation {geglu,liglu,reglu,swiglu}] +[default0]: [--kill-switch-path KILL_SWITCH_PATH] +[default0]: [--log-level {debug,info,warning,error,critical}] +[default3]: [--use-contiguous-buffers-in-ddp] +[default0]: [--log-level-replica {debug,info,warning,error,critical}] +[default0]: [--attention-dropout ATTENTION_DROPOUT] +[default3]: [--no-scatter-gather-tensors-in-pipeline] +[default3]: [--local_rank LOCAL_RANK] +[default0]: [--hidden-dropout HIDDEN_DROPOUT] +[default0]: [--weight-decay WEIGHT_DECAY] [--clip-grad CLIP_GRAD] +[default3]: [--lazy-mpu-init LAZY_MPU_INIT] +[default3]: [--use-cpu-initialization] [--eval-iters EVAL_ITERS] +[default0]: [--adam-beta1 ADAM_BETA1] [--adam-beta2 ADAM_BETA2] +[default0]: [--adam-eps ADAM_EPS] [--sgd-momentum SGD_MOMENTUM] +[default3]: [--eval-interval EVAL_INTERVAL] +[default2]: [--exit-duration-in-mins EXIT_DURATION_IN_MINS] +[default2]: [--tensorboard-dir TENSORBOARD_DIR] +[default2]: [--no-masked-softmax-fusion] [--no-bias-gelu-fusion] +[default2]: [--no-bias-dropout-fusion] [--optimizer {adam,sgd}] +[default2]: [--use-bnb-optimizer] +[default4]: [--weight-decay WEIGHT_DECAY] [--clip-grad CLIP_GRAD] +[default4]: [--adam-beta1 ADAM_BETA1] [--adam-beta2 ADAM_BETA2] +[default4]: [--adam-eps ADAM_EPS] [--sgd-momentum SGD_MOMENTUM] +[default4]: [--micro-batch-size MICRO_BATCH_SIZE] +[default3]: [--data-path [DATA_PATH [DATA_PATH ...]]] +[default3]: [--split SPLIT] +[default3]: [--train-weighted-split-paths [TRAIN_WEIGHTED_SPLIT_PATHS [TRAIN_WEIGHTED_SPLIT_PATHS ...]]] +[default3]: [--valid-weighted-split-paths [VALID_WEIGHTED_SPLIT_PATHS [VALID_WEIGHTED_SPLIT_PATHS ...]]] +[default2]: [--dataloader-type {single,cyclic}] [--cpu-optimizer] +[default2]: [--cpu_torch_adam] [--codecarbon-dir CODECARBON_DIR] +[default3]: [--test-weighted-split-paths [TEST_WEIGHTED_SPLIT_PATHS [TEST_WEIGHTED_SPLIT_PATHS ...]]] +[default4]: [--batch-size BATCH_SIZE] +[default4]: [--global-batch-size GLOBAL_BATCH_SIZE] +[default4]: [--rampup-batch-size [RAMPUP_BATCH_SIZE [RAMPUP_BATCH_SIZE ...]]] +[default4]: [--checkpoint-activations] +[default4]: [--distribute-checkpointed-activations] +[default2]: [--eval-only EVAL_ONLY] +[default1]: [--loss-scale LOSS_SCALE] +[default3]: [--train-weighted-split-paths-path TRAIN_WEIGHTED_SPLIT_PATHS_PATH] +[default3]: [--valid-weighted-split-paths-path VALID_WEIGHTED_SPLIT_PATHS_PATH] +[default4]: [--checkpoint-num-layers CHECKPOINT_NUM_LAYERS] +[default1]: [--initial-loss-scale INITIAL_LOSS_SCALE] +[default1]: [--min-loss-scale MIN_LOSS_SCALE] +[default1]: [--loss-scale-window LOSS_SCALE_WINDOW] +[default1]: [--hysteresis HYSTERESIS] [--fp32-residual-connection] +[default2]: [--skip-train-iteration-range SKIP_TRAIN_ITERATION_RANGE [SKIP_TRAIN_ITERATION_RANGE ...]] +[default2]: [--inference] +[default3]: [--test-weighted-split-paths-path TEST_WEIGHTED_SPLIT_PATHS_PATH] +[default3]: [--log-path LOG_PATH] [--vocab-file VOCAB_FILE] +[default3]: [--merge-file MERGE_FILE] +[default3]: [--vocab-extra-ids VOCAB_EXTRA_IDS] +[default0]: [--micro-batch-size MICRO_BATCH_SIZE] +[default4]: [--train-iters TRAIN_ITERS] +[default4]: [--train-samples TRAIN_SAMPLES] +[default0]: [--batch-size BATCH_SIZE] +[default1]: [--no-query-key-layer-scaling] +[default4]: [--train-tokens TRAIN_TOKENS] +[default4]: [--log-interval LOG_INTERVAL] +[default4]: [--exit-interval EXIT_INTERVAL] +[default4]: [--exit-duration-in-mins EXIT_DURATION_IN_MINS] +[default4]: [--tensorboard-dir TENSORBOARD_DIR] +[default4]: [--no-masked-softmax-fusion] [--no-bias-gelu-fusion] +[default3]: [--seq-length SEQ_LENGTH] +[default0]: [--global-batch-size GLOBAL_BATCH_SIZE] +[default3]: [--encoder-seq-length ENCODER_SEQ_LENGTH] +[default3]: [--decoder-seq-length DECODER_SEQ_LENGTH] +[default3]: [--retriever-seq-length RETRIEVER_SEQ_LENGTH] +[default3]: [--sample-rate SAMPLE_RATE] [--mask-prob MASK_PROB] +[default3]: [--short-seq-prob SHORT_SEQ_PROB] [--mmap-warmup] +[default3]: [--num-workers NUM_WORKERS] +[default3]: [--valid-num-workers VALID_NUM_WORKERS] +[default3]: [--tokenizer-type {BertWordPieceLowerCase,BertWordPieceCase,GPT2BPETokenizer,PretrainedFromHF}] +[default3]: [--tokenizer-name-or-path TOKENIZER_NAME_OR_PATH] +[default4]: [--no-bias-dropout-fusion] [--optimizer {adam,sgd}] +[default1]: [--attention-softmax-in-fp32] +[default0]: [--rampup-batch-size [RAMPUP_BATCH_SIZE [RAMPUP_BATCH_SIZE ...]]] +[default0]: [--checkpoint-activations] +[default0]: [--distribute-checkpointed-activations] +[default2]: [--abort-on-unmet-fused-kernel-constraints] +[default1]: [--accumulate-allreduce-grads-in-fp32] +[default1]: [--fp16-lm-cross-entropy] +[default2]: [--pp-partition-method PP_PARTITION_METHOD] +[default4]: [--use-bnb-optimizer] +[default4]: [--dataloader-type {single,cyclic}] [--cpu-optimizer] +[default4]: [--cpu_torch_adam] [--codecarbon-dir CODECARBON_DIR] +[default1]: [--tensor-model-parallel-size TENSOR_MODEL_PARALLEL_SIZE] +[default1]: [--pipeline-model-parallel-size PIPELINE_MODEL_PARALLEL_SIZE] +[default1]: [--model-parallel-size MODEL_PARALLEL_SIZE] +[default1]: [--num-layers-per-virtual-pipeline-stage NUM_LAYERS_PER_VIRTUAL_PIPELINE_STAGE] +[default3]: [--data-impl {lazy,cached,mmap,infer}] +[default3]: [--reset-position-ids] [--reset-attention-mask] +[default3]: [--eod-mask-loss] [--loss-on-targets-only] +[default1]: [--distributed-backend {nccl,gloo}] +[default1]: [--DDP-impl {local,torch}] +[default1]: [--use-contiguous-buffers-in-ddp] +[default1]: [--no-scatter-gather-tensors-in-pipeline] +[default3]: [--norm-target-loss] +[default1]: [--local_rank LOCAL_RANK] +[default1]: [--lazy-mpu-init LAZY_MPU_INIT] +[default4]: [--eval-only EVAL_ONLY] +[default4]: [--skip-train-iteration-range SKIP_TRAIN_ITERATION_RANGE [SKIP_TRAIN_ITERATION_RANGE ...]] +[default4]: [--inference] +[default4]: [--abort-on-unmet-fused-kernel-constraints] +[default2]: [--seed SEED] [--init-method-std INIT_METHOD_STD] +[default0]: [--checkpoint-num-layers CHECKPOINT_NUM_LAYERS] +[default0]: [--train-iters TRAIN_ITERS] +[default3]: [--reweight-loss-based-on-position-frequency] +[default3]: [--noise-density NOISE_DENSITY] +[default3]: [--mean-noise-span-length MEAN_NOISE_SPAN_LENGTH] +[default3]: [--prefixlm] [--adlr-autoresume] +[default4]: [--pp-partition-method PP_PARTITION_METHOD] +[default4]: [--seed SEED] [--init-method-std INIT_METHOD_STD] +[default4]: [--init-method-xavier-uniform] [--lr LR] +[default4]: [--lr-decay-style {constant,linear,cosine}] +[default4]: [--lr-decay-iters LR_DECAY_ITERS] +[default1]: [--use-cpu-initialization] [--eval-iters EVAL_ITERS] +[default3]: [--adlr-autoresume-interval ADLR_AUTORESUME_INTERVAL] +[default2]: [--init-method-xavier-uniform] [--lr LR] +[default2]: [--lr-decay-style {constant,linear,cosine}] +[default2]: [--lr-decay-iters LR_DECAY_ITERS] +[default4]: [--lr-decay-samples LR_DECAY_SAMPLES] +[default4]: [--lr-decay-tokens LR_DECAY_TOKENS] +[default4]: [--lr-warmup-fraction LR_WARMUP_FRACTION] +[default4]: [--lr-warmup-iters LR_WARMUP_ITERS] +[default4]: [--lr-warmup-samples LR_WARMUP_SAMPLES] +[default4]: [--warmup WARMUP] [--min-lr MIN_LR] +[default4]: [--override-lr-scheduler] +[default4]: [--use-checkpoint-lr-scheduler] [--save SAVE] +[default0]: [--train-samples TRAIN_SAMPLES] +[default0]: [--train-tokens TRAIN_TOKENS] +[default1]: [--eval-interval EVAL_INTERVAL] +[default4]: [--save-interval SAVE_INTERVAL] [--no-save-optim] +[default4]: [--no-save-rng] [--load LOAD] [--no-load-optim] +[default2]: [--lr-decay-samples LR_DECAY_SAMPLES] +[default2]: [--lr-decay-tokens LR_DECAY_TOKENS] +[default2]: [--lr-warmup-fraction LR_WARMUP_FRACTION] +[default1]: [--data-path [DATA_PATH [DATA_PATH ...]]] +[default1]: [--split SPLIT] +[default3]: [--ict-head-size ICT_HEAD_SIZE] +[default4]: [--no-load-rng] [--finetune] [--fp16] [--bf16] +[default4]: [--loss-scale LOSS_SCALE] +[default4]: [--initial-loss-scale INITIAL_LOSS_SCALE] +[default4]: [--min-loss-scale MIN_LOSS_SCALE] +[default4]: [--loss-scale-window LOSS_SCALE_WINDOW] +[default0]: [--log-interval LOG_INTERVAL] +[default0]: [--exit-interval EXIT_INTERVAL] +[default0]: [--exit-duration-in-mins EXIT_DURATION_IN_MINS] +[default3]: [--biencoder-projection-dim BIENCODER_PROJECTION_DIM] +[default3]: [--biencoder-shared-query-context-model] +[default3]: [--ict-load ICT_LOAD] [--bert-load BERT_LOAD] +[default3]: [--titles-data-path TITLES_DATA_PATH] +[default4]: [--hysteresis HYSTERESIS] [--fp32-residual-connection] +[default0]: [--tensorboard-dir TENSORBOARD_DIR] +[default3]: [--query-in-block-prob QUERY_IN_BLOCK_PROB] +[default3]: [--use-one-sent-docs] +[default3]: [--evidence-data-path EVIDENCE_DATA_PATH] +[default3]: [--retriever-report-topk-accuracies RETRIEVER_REPORT_TOPK_ACCURACIES [RETRIEVER_REPORT_TOPK_ACCURACIES ...]] +[default3]: [--retriever-score-scaling] +[default1]: [--train-weighted-split-paths [TRAIN_WEIGHTED_SPLIT_PATHS [TRAIN_WEIGHTED_SPLIT_PATHS ...]]] +[default3]: [--block-data-path BLOCK_DATA_PATH] +[default3]: [--embedding-path EMBEDDING_PATH] +[default3]: [--indexer-batch-size INDEXER_BATCH_SIZE] +[default3]: [--indexer-log-interval INDEXER_LOG_INTERVAL] +[default1]: [--valid-weighted-split-paths [VALID_WEIGHTED_SPLIT_PATHS [VALID_WEIGHTED_SPLIT_PATHS ...]]] +[default2]: [--lr-warmup-iters LR_WARMUP_ITERS] +[default4]: [--no-query-key-layer-scaling] +[default4]: [--attention-softmax-in-fp32] +[default0]: [--no-masked-softmax-fusion] [--no-bias-gelu-fusion] +[default0]: [--no-bias-dropout-fusion] [--optimizer {adam,sgd}] +[default1]: [--test-weighted-split-paths [TEST_WEIGHTED_SPLIT_PATHS [TEST_WEIGHTED_SPLIT_PATHS ...]]] +[default0]: [--use-bnb-optimizer] +[default0]: [--dataloader-type {single,cyclic}] [--cpu-optimizer] +[default0]: [--cpu_torch_adam] [--codecarbon-dir CODECARBON_DIR] +[default2]: [--lr-warmup-samples LR_WARMUP_SAMPLES] +[default2]: [--warmup WARMUP] [--min-lr MIN_LR] +[default0]: [--eval-only EVAL_ONLY] +[default2]: [--override-lr-scheduler] +[default4]: [--accumulate-allreduce-grads-in-fp32] +[default4]: [--fp16-lm-cross-entropy] +[default4]: [--tensor-model-parallel-size TENSOR_MODEL_PARALLEL_SIZE] +[default4]: [--pipeline-model-parallel-size PIPELINE_MODEL_PARALLEL_SIZE] +[default4]: [--model-parallel-size MODEL_PARALLEL_SIZE] +[default1]: [--train-weighted-split-paths-path TRAIN_WEIGHTED_SPLIT_PATHS_PATH] +[default1]: [--valid-weighted-split-paths-path VALID_WEIGHTED_SPLIT_PATHS_PATH] +[default1]: [--test-weighted-split-paths-path TEST_WEIGHTED_SPLIT_PATHS_PATH] +[default1]: [--log-path LOG_PATH] [--vocab-file VOCAB_FILE] +[default4]: [--num-layers-per-virtual-pipeline-stage NUM_LAYERS_PER_VIRTUAL_PIPELINE_STAGE] +[default2]: [--use-checkpoint-lr-scheduler] [--save SAVE] +[default2]: [--save-interval SAVE_INTERVAL] [--no-save-optim] +[default4]: [--distributed-backend {nccl,gloo}] +[default4]: [--DDP-impl {local,torch}] +[default3]: [--num-classes NUM_CLASSES] [--img-dim IMG_DIM] +[default3]: [--num-channels NUM_CHANNELS] [--patch-dim PATCH_DIM] +[default3]: [--log-params-norm] [--log-num-zeros-in-grad] +[default2]: [--no-save-rng] [--load LOAD] [--no-load-optim] +[default4]: [--use-contiguous-buffers-in-ddp] +[default4]: [--no-scatter-gather-tensors-in-pipeline] +[default4]: [--local_rank LOCAL_RANK] +[default4]: [--lazy-mpu-init LAZY_MPU_INIT] +[default4]: [--use-cpu-initialization] [--eval-iters EVAL_ITERS] +[default4]: [--eval-interval EVAL_INTERVAL] +[default4]: [--data-path [DATA_PATH [DATA_PATH ...]]] +[default4]: [--split SPLIT] +[default4]: [--train-weighted-split-paths [TRAIN_WEIGHTED_SPLIT_PATHS [TRAIN_WEIGHTED_SPLIT_PATHS ...]]] +[default2]: [--no-load-rng] [--finetune] [--fp16] [--bf16] +[default4]: [--valid-weighted-split-paths [VALID_WEIGHTED_SPLIT_PATHS [VALID_WEIGHTED_SPLIT_PATHS ...]]] +[default4]: [--test-weighted-split-paths [TEST_WEIGHTED_SPLIT_PATHS [TEST_WEIGHTED_SPLIT_PATHS ...]]] +[default4]: [--train-weighted-split-paths-path TRAIN_WEIGHTED_SPLIT_PATHS_PATH] +[default4]: [--valid-weighted-split-paths-path VALID_WEIGHTED_SPLIT_PATHS_PATH] +[default4]: [--test-weighted-split-paths-path TEST_WEIGHTED_SPLIT_PATHS_PATH] +[default0]: [--skip-train-iteration-range SKIP_TRAIN_ITERATION_RANGE [SKIP_TRAIN_ITERATION_RANGE ...]] +[default2]: [--loss-scale LOSS_SCALE] +[default2]: [--initial-loss-scale INITIAL_LOSS_SCALE] +[default2]: [--min-loss-scale MIN_LOSS_SCALE] +[default0]: [--inference] +[default0]: [--abort-on-unmet-fused-kernel-constraints] +[default0]: [--pp-partition-method PP_PARTITION_METHOD] +[default1]: [--merge-file MERGE_FILE] +[default1]: [--vocab-extra-ids VOCAB_EXTRA_IDS] +[default1]: [--seq-length SEQ_LENGTH] +[default0]: [--seed SEED] [--init-method-std INIT_METHOD_STD] +[default0]: [--init-method-xavier-uniform] [--lr LR] +[default0]: [--lr-decay-style {constant,linear,cosine}] +[default0]: [--lr-decay-iters LR_DECAY_ITERS] +[default1]: [--encoder-seq-length ENCODER_SEQ_LENGTH] +[default1]: [--decoder-seq-length DECODER_SEQ_LENGTH] +[default3]: [--tensorboard-log-interval TENSORBOARD_LOG_INTERVAL] +[default3]: [--tensorboard-queue-size TENSORBOARD_QUEUE_SIZE] +[default3]: [--log-timers-to-tensorboard] +[default3]: [--log-batch-size-to-tensorboard] +[default3]: [--no-log-learnig-rate-to-tensorboard] +[default3]: [--no-log-loss-scale-to-tensorboard] +[default3]: [--log-validation-ppl-to-tensorboard] +[default3]: [--zero-stage ZERO_STAGE] [--zero-reduce-scatter] +[default3]: [--zero-contigious-gradients] +[default0]: [--lr-decay-samples LR_DECAY_SAMPLES] +[default0]: [--lr-decay-tokens LR_DECAY_TOKENS] +[default3]: [--zero-reduce-bucket-size ZERO_REDUCE_BUCKET_SIZE] +[default3]: [--zero-allgather-bucket-size ZERO_ALLGATHER_BUCKET_SIZE] +[default3]: [--remote-device {none,cpu,nvme}] [--use-pin-memory] +[default0]: [--lr-warmup-fraction LR_WARMUP_FRACTION] +[default0]: [--lr-warmup-iters LR_WARMUP_ITERS] +[default2]: [--loss-scale-window LOSS_SCALE_WINDOW] +[default0]: [--lr-warmup-samples LR_WARMUP_SAMPLES] +[default0]: [--warmup WARMUP] [--min-lr MIN_LR] +[default1]: [--retriever-seq-length RETRIEVER_SEQ_LENGTH] +[default3]: [--scattered-embeddings] [--split-transformers] +[default3]: [--memory-centric-tiled-linear] +[default3]: [--tile-factor TILE_FACTOR] +[default0]: [--override-lr-scheduler] +[default0]: [--use-checkpoint-lr-scheduler] [--save SAVE] +[default0]: [--save-interval SAVE_INTERVAL] [--no-save-optim] +[default0]: [--no-save-rng] [--load LOAD] [--no-load-optim] +[default3]: [--deepspeed-activation-checkpointing] +[default0]: [--no-load-rng] [--finetune] [--fp16] [--bf16] +[default2]: [--hysteresis HYSTERESIS] [--fp32-residual-connection] +[default4]: [--log-path LOG_PATH] [--vocab-file VOCAB_FILE] +[default3]: [--partition-activations] [--contigious-checkpointing] +[default3]: [--checkpoint-in-cpu] [--synchronize-each-layer] +[default4]: [--merge-file MERGE_FILE] +[default4]: [--vocab-extra-ids VOCAB_EXTRA_IDS] +[default4]: [--seq-length SEQ_LENGTH] +[default4]: [--encoder-seq-length ENCODER_SEQ_LENGTH] +[default4]: [--decoder-seq-length DECODER_SEQ_LENGTH] +[default4]: [--retriever-seq-length RETRIEVER_SEQ_LENGTH] +[default1]: [--sample-rate SAMPLE_RATE] [--mask-prob MASK_PROB] +[default1]: [--short-seq-prob SHORT_SEQ_PROB] [--mmap-warmup] +[default0]: [--loss-scale LOSS_SCALE] +[default1]: [--num-workers NUM_WORKERS] +[default1]: [--valid-num-workers VALID_NUM_WORKERS] +[default1]: [--tokenizer-type {BertWordPieceLowerCase,BertWordPieceCase,GPT2BPETokenizer,PretrainedFromHF}] +[default1]: [--tokenizer-name-or-path TOKENIZER_NAME_OR_PATH] +[default1]: [--data-impl {lazy,cached,mmap,infer}] +[default2]: [--no-query-key-layer-scaling] +[default1]: [--reset-position-ids] [--reset-attention-mask] +[default1]: [--eod-mask-loss] [--loss-on-targets-only] +[default1]: [--norm-target-loss] +[default1]: [--reweight-loss-based-on-position-frequency] +[default1]: [--noise-density NOISE_DENSITY] +[default0]: [--initial-loss-scale INITIAL_LOSS_SCALE] +[default0]: [--min-loss-scale MIN_LOSS_SCALE] +[default0]: [--loss-scale-window LOSS_SCALE_WINDOW] +[default3]: [--profile-backward] [--deepspeed] +[default3]: [--deepspeed_config DEEPSPEED_CONFIG] [--deepscale] +[default3]: [--deepscale_config DEEPSCALE_CONFIG] [--deepspeed_mpi] +[default2]: [--attention-softmax-in-fp32] +[default2]: [--accumulate-allreduce-grads-in-fp32] +[default1]: [--mean-noise-span-length MEAN_NOISE_SPAN_LENGTH] +[default3]:finetune_t0.py: error: unrecognized arguments: --reset-progress +[default2]: [--fp16-lm-cross-entropy] +[default4]: [--sample-rate SAMPLE_RATE] [--mask-prob MASK_PROB] +[default4]: [--short-seq-prob SHORT_SEQ_PROB] [--mmap-warmup] +[default4]: [--num-workers NUM_WORKERS] +[default4]: [--valid-num-workers VALID_NUM_WORKERS] +[default4]: [--tokenizer-type {BertWordPieceLowerCase,BertWordPieceCase,GPT2BPETokenizer,PretrainedFromHF}] +[default0]: [--hysteresis HYSTERESIS] [--fp32-residual-connection] +[default0]: [--no-query-key-layer-scaling] +[default0]: [--attention-softmax-in-fp32] +[default2]: [--tensor-model-parallel-size TENSOR_MODEL_PARALLEL_SIZE] +[default2]: [--pipeline-model-parallel-size PIPELINE_MODEL_PARALLEL_SIZE] +[default2]: [--model-parallel-size MODEL_PARALLEL_SIZE] +[default2]: [--num-layers-per-virtual-pipeline-stage NUM_LAYERS_PER_VIRTUAL_PIPELINE_STAGE] +[default4]: [--tokenizer-name-or-path TOKENIZER_NAME_OR_PATH] +[default0]: [--accumulate-allreduce-grads-in-fp32] +[default0]: [--fp16-lm-cross-entropy] +[default0]: [--tensor-model-parallel-size TENSOR_MODEL_PARALLEL_SIZE] +[default1]: [--prefixlm] [--adlr-autoresume] +[default1]: [--adlr-autoresume-interval ADLR_AUTORESUME_INTERVAL] +[default1]: [--ict-head-size ICT_HEAD_SIZE] +[default1]: [--biencoder-projection-dim BIENCODER_PROJECTION_DIM] +[default1]: [--biencoder-shared-query-context-model] +[default2]: [--distributed-backend {nccl,gloo}] +[default2]: [--DDP-impl {local,torch}] +[default0]: [--pipeline-model-parallel-size PIPELINE_MODEL_PARALLEL_SIZE] +[default1]: [--ict-load ICT_LOAD] [--bert-load BERT_LOAD] +[default1]: [--titles-data-path TITLES_DATA_PATH] +[default0]: [--model-parallel-size MODEL_PARALLEL_SIZE] +[default2]: [--use-contiguous-buffers-in-ddp] +[default4]: [--data-impl {lazy,cached,mmap,infer}] +[default1]: [--query-in-block-prob QUERY_IN_BLOCK_PROB] +[default4]: [--reset-position-ids] [--reset-attention-mask] +[default2]: [--no-scatter-gather-tensors-in-pipeline] +[default0]: [--num-layers-per-virtual-pipeline-stage NUM_LAYERS_PER_VIRTUAL_PIPELINE_STAGE] +[default0]: [--distributed-backend {nccl,gloo}] +[default0]: [--DDP-impl {local,torch}] +[default0]: [--use-contiguous-buffers-in-ddp] +[default2]: [--local_rank LOCAL_RANK] +[default2]: [--lazy-mpu-init LAZY_MPU_INIT] +[default2]: [--use-cpu-initialization] [--eval-iters EVAL_ITERS] +[default0]: [--no-scatter-gather-tensors-in-pipeline] +[default4]: [--eod-mask-loss] [--loss-on-targets-only] +[default4]: [--norm-target-loss] +[default4]: [--reweight-loss-based-on-position-frequency] +[default2]: [--eval-interval EVAL_INTERVAL] +[default1]: [--use-one-sent-docs] +[default1]: [--evidence-data-path EVIDENCE_DATA_PATH] +[default1]: [--retriever-report-topk-accuracies RETRIEVER_REPORT_TOPK_ACCURACIES [RETRIEVER_REPORT_TOPK_ACCURACIES ...]] +[default1]: [--retriever-score-scaling] +[default2]: [--data-path [DATA_PATH [DATA_PATH ...]]] +[default2]: [--split SPLIT] +[default0]: [--local_rank LOCAL_RANK] +[default0]: [--lazy-mpu-init LAZY_MPU_INIT] +[default2]: [--train-weighted-split-paths [TRAIN_WEIGHTED_SPLIT_PATHS [TRAIN_WEIGHTED_SPLIT_PATHS ...]]] +[default2]: [--valid-weighted-split-paths [VALID_WEIGHTED_SPLIT_PATHS [VALID_WEIGHTED_SPLIT_PATHS ...]]] +[default4]: [--noise-density NOISE_DENSITY] +[default2]: [--test-weighted-split-paths [TEST_WEIGHTED_SPLIT_PATHS [TEST_WEIGHTED_SPLIT_PATHS ...]]] +[default2]: [--train-weighted-split-paths-path TRAIN_WEIGHTED_SPLIT_PATHS_PATH] +[default2]: [--valid-weighted-split-paths-path VALID_WEIGHTED_SPLIT_PATHS_PATH] +[default2]: [--test-weighted-split-paths-path TEST_WEIGHTED_SPLIT_PATHS_PATH] +[default1]: [--block-data-path BLOCK_DATA_PATH] +[default1]: [--embedding-path EMBEDDING_PATH] +[default1]: [--indexer-batch-size INDEXER_BATCH_SIZE] +[default0]: [--use-cpu-initialization] [--eval-iters EVAL_ITERS] +[default0]: [--eval-interval EVAL_INTERVAL] +[default1]: [--indexer-log-interval INDEXER_LOG_INTERVAL] +[default1]: [--num-classes NUM_CLASSES] [--img-dim IMG_DIM] +[default1]: [--num-channels NUM_CHANNELS] [--patch-dim PATCH_DIM] +[default1]: [--log-params-norm] [--log-num-zeros-in-grad] +[default2]: [--log-path LOG_PATH] [--vocab-file VOCAB_FILE] +[default4]: [--mean-noise-span-length MEAN_NOISE_SPAN_LENGTH] +[default4]: [--prefixlm] [--adlr-autoresume] +[default4]: [--adlr-autoresume-interval ADLR_AUTORESUME_INTERVAL] +[default4]: [--ict-head-size ICT_HEAD_SIZE] +[default2]: [--merge-file MERGE_FILE] +[default4]: [--biencoder-projection-dim BIENCODER_PROJECTION_DIM] +[default4]: [--biencoder-shared-query-context-model] +[default4]: [--ict-load ICT_LOAD] [--bert-load BERT_LOAD] +[default0]: [--data-path [DATA_PATH [DATA_PATH ...]]] +[default0]: [--split SPLIT] +[default0]: [--train-weighted-split-paths [TRAIN_WEIGHTED_SPLIT_PATHS [TRAIN_WEIGHTED_SPLIT_PATHS ...]]] +[default1]: [--tensorboard-log-interval TENSORBOARD_LOG_INTERVAL] +[default1]: [--tensorboard-queue-size TENSORBOARD_QUEUE_SIZE] +[default1]: [--log-timers-to-tensorboard] +[default1]: [--log-batch-size-to-tensorboard] +[default4]: [--titles-data-path TITLES_DATA_PATH] +[default1]: [--no-log-learnig-rate-to-tensorboard] +[default0]: [--valid-weighted-split-paths [VALID_WEIGHTED_SPLIT_PATHS [VALID_WEIGHTED_SPLIT_PATHS ...]]] +[default4]: [--query-in-block-prob QUERY_IN_BLOCK_PROB] +[default4]: [--use-one-sent-docs] +[default4]: [--evidence-data-path EVIDENCE_DATA_PATH] +[default4]: [--retriever-report-topk-accuracies RETRIEVER_REPORT_TOPK_ACCURACIES [RETRIEVER_REPORT_TOPK_ACCURACIES ...]] +[default1]: [--no-log-loss-scale-to-tensorboard] +[default0]: [--test-weighted-split-paths [TEST_WEIGHTED_SPLIT_PATHS [TEST_WEIGHTED_SPLIT_PATHS ...]]] +[default0]: [--train-weighted-split-paths-path TRAIN_WEIGHTED_SPLIT_PATHS_PATH] +[default1]: [--log-validation-ppl-to-tensorboard] +[default1]: [--zero-stage ZERO_STAGE] [--zero-reduce-scatter] +[default1]: [--zero-contigious-gradients] +[default1]: [--zero-reduce-bucket-size ZERO_REDUCE_BUCKET_SIZE] +[default2]: [--vocab-extra-ids VOCAB_EXTRA_IDS] +[default4]: [--retriever-score-scaling] +[default4]: [--block-data-path BLOCK_DATA_PATH] +[default4]: [--embedding-path EMBEDDING_PATH] +[default4]: [--indexer-batch-size INDEXER_BATCH_SIZE] +[default4]: [--indexer-log-interval INDEXER_LOG_INTERVAL] +[default4]: [--num-classes NUM_CLASSES] [--img-dim IMG_DIM] +[default4]: [--num-channels NUM_CHANNELS] [--patch-dim PATCH_DIM] +[default4]: [--log-params-norm] [--log-num-zeros-in-grad] +[default4]: [--tensorboard-log-interval TENSORBOARD_LOG_INTERVAL] +[default2]: [--seq-length SEQ_LENGTH] +[default4]: [--tensorboard-queue-size TENSORBOARD_QUEUE_SIZE] +[default4]: [--log-timers-to-tensorboard] +[default2]: [--encoder-seq-length ENCODER_SEQ_LENGTH] +[default2]: [--decoder-seq-length DECODER_SEQ_LENGTH] +[default2]: [--retriever-seq-length RETRIEVER_SEQ_LENGTH] +[default4]: [--log-batch-size-to-tensorboard] +[default2]: [--sample-rate SAMPLE_RATE] [--mask-prob MASK_PROB] +[default2]: [--short-seq-prob SHORT_SEQ_PROB] [--mmap-warmup] +[default1]: [--zero-allgather-bucket-size ZERO_ALLGATHER_BUCKET_SIZE] +[default2]: [--num-workers NUM_WORKERS] +[default0]: [--valid-weighted-split-paths-path VALID_WEIGHTED_SPLIT_PATHS_PATH] +[default0]: [--test-weighted-split-paths-path TEST_WEIGHTED_SPLIT_PATHS_PATH] +[default2]: [--valid-num-workers VALID_NUM_WORKERS] +[default2]: [--tokenizer-type {BertWordPieceLowerCase,BertWordPieceCase,GPT2BPETokenizer,PretrainedFromHF}] +[default2]: [--tokenizer-name-or-path TOKENIZER_NAME_OR_PATH] +[default0]: [--log-path LOG_PATH] [--vocab-file VOCAB_FILE] +[default2]: [--data-impl {lazy,cached,mmap,infer}] +[default2]: [--reset-position-ids] [--reset-attention-mask] +[default2]: [--eod-mask-loss] [--loss-on-targets-only] +[default2]: [--norm-target-loss] +[default1]: [--remote-device {none,cpu,nvme}] [--use-pin-memory] +[default0]: [--merge-file MERGE_FILE] +[default0]: [--vocab-extra-ids VOCAB_EXTRA_IDS] +[default1]: [--scattered-embeddings] [--split-transformers] +[default1]: [--memory-centric-tiled-linear] +[default1]: [--tile-factor TILE_FACTOR] +[default0]: [--seq-length SEQ_LENGTH] +[default1]: [--deepspeed-activation-checkpointing] +[default1]: [--partition-activations] [--contigious-checkpointing] +[default1]: [--checkpoint-in-cpu] [--synchronize-each-layer] +[default0]: [--encoder-seq-length ENCODER_SEQ_LENGTH] +[default1]: [--profile-backward] [--deepspeed] +[default1]: [--deepspeed_config DEEPSPEED_CONFIG] [--deepscale] +[default1]: [--deepscale_config DEEPSCALE_CONFIG] [--deepspeed_mpi] +[default0]: [--decoder-seq-length DECODER_SEQ_LENGTH] +[default4]: [--no-log-learnig-rate-to-tensorboard] +[default4]: [--no-log-loss-scale-to-tensorboard] +[default4]: [--log-validation-ppl-to-tensorboard] +[default0]: [--retriever-seq-length RETRIEVER_SEQ_LENGTH] +[default4]: [--zero-stage ZERO_STAGE] [--zero-reduce-scatter] +[default4]: [--zero-contigious-gradients] +[default4]: [--zero-reduce-bucket-size ZERO_REDUCE_BUCKET_SIZE] +[default4]: [--zero-allgather-bucket-size ZERO_ALLGATHER_BUCKET_SIZE] +[default1]:finetune_t0.py: error: unrecognized arguments: --reset-progress +[default2]: [--reweight-loss-based-on-position-frequency] +[default4]: [--remote-device {none,cpu,nvme}] [--use-pin-memory] +[default4]: [--scattered-embeddings] [--split-transformers] +[default4]: [--memory-centric-tiled-linear] +[default0]: [--sample-rate SAMPLE_RATE] [--mask-prob MASK_PROB] +[default0]: [--short-seq-prob SHORT_SEQ_PROB] [--mmap-warmup] +[default2]: [--noise-density NOISE_DENSITY] +[default2]: [--mean-noise-span-length MEAN_NOISE_SPAN_LENGTH] +[default2]: [--prefixlm] [--adlr-autoresume] +[default4]: [--tile-factor TILE_FACTOR] +[default4]: [--deepspeed-activation-checkpointing] +[default2]: [--adlr-autoresume-interval ADLR_AUTORESUME_INTERVAL] +[default2]: [--ict-head-size ICT_HEAD_SIZE] +[default0]: [--num-workers NUM_WORKERS] +[default0]: [--valid-num-workers VALID_NUM_WORKERS] +[default4]: [--partition-activations] [--contigious-checkpointing] +[default4]: [--checkpoint-in-cpu] [--synchronize-each-layer] +[default4]: [--profile-backward] [--deepspeed] +[default0]: [--tokenizer-type {BertWordPieceLowerCase,BertWordPieceCase,GPT2BPETokenizer,PretrainedFromHF}] +[default0]: [--tokenizer-name-or-path TOKENIZER_NAME_OR_PATH] +[default4]: [--deepspeed_config DEEPSPEED_CONFIG] [--deepscale] +[default4]: [--deepscale_config DEEPSCALE_CONFIG] [--deepspeed_mpi] +[default4]:finetune_t0.py: error: unrecognized arguments: --reset-progress +[default0]: [--data-impl {lazy,cached,mmap,infer}] +[default0]: [--reset-position-ids] [--reset-attention-mask] +[default2]: [--biencoder-projection-dim BIENCODER_PROJECTION_DIM] +[default2]: [--biencoder-shared-query-context-model] +[default0]: [--eod-mask-loss] [--loss-on-targets-only] +[default0]: [--norm-target-loss] +[default2]: [--ict-load ICT_LOAD] [--bert-load BERT_LOAD] +[default2]: [--titles-data-path TITLES_DATA_PATH] +[default2]: [--query-in-block-prob QUERY_IN_BLOCK_PROB] +[default0]: [--reweight-loss-based-on-position-frequency] +[default0]: [--noise-density NOISE_DENSITY] +[default2]: [--use-one-sent-docs] +[default2]: [--evidence-data-path EVIDENCE_DATA_PATH] +[default2]: [--retriever-report-topk-accuracies RETRIEVER_REPORT_TOPK_ACCURACIES [RETRIEVER_REPORT_TOPK_ACCURACIES ...]] +[default0]: [--mean-noise-span-length MEAN_NOISE_SPAN_LENGTH] +[default0]: [--prefixlm] [--adlr-autoresume] +[default2]: [--retriever-score-scaling] +[default0]: [--adlr-autoresume-interval ADLR_AUTORESUME_INTERVAL] +[default0]: [--ict-head-size ICT_HEAD_SIZE] +[default0]: [--biencoder-projection-dim BIENCODER_PROJECTION_DIM] +[default0]: [--biencoder-shared-query-context-model] +[default0]: [--ict-load ICT_LOAD] [--bert-load BERT_LOAD] +[default0]: [--titles-data-path TITLES_DATA_PATH] +[default0]: [--query-in-block-prob QUERY_IN_BLOCK_PROB] +[default0]: [--use-one-sent-docs] +[default2]: [--block-data-path BLOCK_DATA_PATH] +[default2]: [--embedding-path EMBEDDING_PATH] +[default2]: [--indexer-batch-size INDEXER_BATCH_SIZE] +[default2]: [--indexer-log-interval INDEXER_LOG_INTERVAL] +[default0]: [--evidence-data-path EVIDENCE_DATA_PATH] +[default0]: [--retriever-report-topk-accuracies RETRIEVER_REPORT_TOPK_ACCURACIES [RETRIEVER_REPORT_TOPK_ACCURACIES ...]] +[default0]: [--retriever-score-scaling] +[default2]: [--num-classes NUM_CLASSES] [--img-dim IMG_DIM] +[default2]: [--num-channels NUM_CHANNELS] [--patch-dim PATCH_DIM] +[default2]: [--log-params-norm] [--log-num-zeros-in-grad] +[default2]: [--tensorboard-log-interval TENSORBOARD_LOG_INTERVAL] +[default2]: [--tensorboard-queue-size TENSORBOARD_QUEUE_SIZE] +[default0]: [--block-data-path BLOCK_DATA_PATH] +[default0]: [--embedding-path EMBEDDING_PATH] +[default2]: [--log-timers-to-tensorboard] +[default2]: [--log-batch-size-to-tensorboard] +[default2]: [--no-log-learnig-rate-to-tensorboard] +[default0]: [--indexer-batch-size INDEXER_BATCH_SIZE] +[default0]: [--indexer-log-interval INDEXER_LOG_INTERVAL] +[default2]: [--no-log-loss-scale-to-tensorboard] +[default2]: [--log-validation-ppl-to-tensorboard] +[default2]: [--zero-stage ZERO_STAGE] [--zero-reduce-scatter] +[default2]: [--zero-contigious-gradients] +[default2]: [--zero-reduce-bucket-size ZERO_REDUCE_BUCKET_SIZE] +[default0]: [--num-classes NUM_CLASSES] [--img-dim IMG_DIM] +[default0]: [--num-channels NUM_CHANNELS] [--patch-dim PATCH_DIM] +[default2]: [--zero-allgather-bucket-size ZERO_ALLGATHER_BUCKET_SIZE] +[default2]: [--remote-device {none,cpu,nvme}] [--use-pin-memory] +[default2]: [--scattered-embeddings] [--split-transformers] +[default2]: [--memory-centric-tiled-linear] +[default0]: [--log-params-norm] [--log-num-zeros-in-grad] +[default0]: [--tensorboard-log-interval TENSORBOARD_LOG_INTERVAL] +[default2]: [--tile-factor TILE_FACTOR] +[default2]: [--deepspeed-activation-checkpointing] +[default2]: [--partition-activations] [--contigious-checkpointing] +[default2]: [--checkpoint-in-cpu] [--synchronize-each-layer] +[default0]: [--tensorboard-queue-size TENSORBOARD_QUEUE_SIZE] +[default0]: [--log-timers-to-tensorboard] +[default2]: [--profile-backward] [--deepspeed] +[default0]: [--log-batch-size-to-tensorboard] +[default0]: [--no-log-learnig-rate-to-tensorboard] +[default2]: [--deepspeed_config DEEPSPEED_CONFIG] [--deepscale] +[default0]: [--no-log-loss-scale-to-tensorboard] +[default0]: [--log-validation-ppl-to-tensorboard] +[default2]: [--deepscale_config DEEPSCALE_CONFIG] [--deepspeed_mpi] +[default0]: [--zero-stage ZERO_STAGE] [--zero-reduce-scatter] +[default0]: [--zero-contigious-gradients] +[default2]:finetune_t0.py: error: unrecognized arguments: --reset-progress +[default0]: [--zero-reduce-bucket-size ZERO_REDUCE_BUCKET_SIZE] +[default0]: [--zero-allgather-bucket-size ZERO_ALLGATHER_BUCKET_SIZE] +[default0]: [--remote-device {none,cpu,nvme}] [--use-pin-memory] +[default0]: [--scattered-embeddings] [--split-transformers] +[default0]: [--memory-centric-tiled-linear] +[default0]: [--tile-factor TILE_FACTOR] +[default0]: [--deepspeed-activation-checkpointing] +[default0]: [--partition-activations] [--contigious-checkpointing] +[default0]: [--checkpoint-in-cpu] [--synchronize-each-layer] +[default0]: [--profile-backward] [--deepspeed] +[default0]: [--deepspeed_config DEEPSPEED_CONFIG] [--deepscale] +[default0]: [--deepscale_config DEEPSCALE_CONFIG] [--deepspeed_mpi] +[default0]:finetune_t0.py: error: unrecognized arguments: --reset-progress +[default0]:usage: finetune_t0.py [-h] [--num-layers NUM_LAYERS] +[default0]: [--hidden-size HIDDEN_SIZE] +[default0]: [--ffn-hidden-size FFN_HIDDEN_SIZE] +[default0]: [--num-attention-heads NUM_ATTENTION_HEADS] +[default0]: [--kv-channels KV_CHANNELS] +[default0]: [--max-position-embeddings MAX_POSITION_EMBEDDINGS] +[default0]: [--make-vocab-size-divisible-by MAKE_VOCAB_SIZE_DIVISIBLE_BY] +[default0]: [--pad-vocab-size-to PAD_VOCAB_SIZE_TO] +[default0]: [--layernorm-epsilon LAYERNORM_EPSILON] +[default0]: [--sync-tp-duplicated-parameters] +[default0]: [--apply-residual-connection-post-layernorm] +[default0]: [--embed-layernorm] [--openai-gelu] +[default0]: [--onnx-safe ONNX_SAFE] [--bert-no-binary-head] +[default0]: [--position-embedding-type {PositionEmbeddingType.rotary,PositionEmbeddingType.absolute,PositionEmbeddingType.alibi}] +[default0]: [--glu-activation {geglu,liglu,reglu,swiglu}] +[default0]: [--kill-switch-path KILL_SWITCH_PATH] +[default0]: [--log-level {debug,info,warning,error,critical}] +[default0]: [--log-level-replica {debug,info,warning,error,critical}] +[default0]: [--attention-dropout ATTENTION_DROPOUT] +[default0]: [--hidden-dropout HIDDEN_DROPOUT] +[default0]: [--weight-decay WEIGHT_DECAY] [--clip-grad CLIP_GRAD] +[default0]: [--adam-beta1 ADAM_BETA1] [--adam-beta2 ADAM_BETA2] +[default0]: [--adam-eps ADAM_EPS] [--sgd-momentum SGD_MOMENTUM] +[default0]: [--micro-batch-size MICRO_BATCH_SIZE] +[default0]: [--batch-size BATCH_SIZE] +[default0]: [--global-batch-size GLOBAL_BATCH_SIZE] +[default0]: [--rampup-batch-size [RAMPUP_BATCH_SIZE [RAMPUP_BATCH_SIZE ...]]] +[default0]: [--checkpoint-activations] +[default0]: [--distribute-checkpointed-activations] +[default0]: [--checkpoint-num-layers CHECKPOINT_NUM_LAYERS] +[default0]: [--train-iters TRAIN_ITERS] +[default0]: [--train-samples TRAIN_SAMPLES] +[default0]: [--train-tokens TRAIN_TOKENS] +[default0]: [--log-interval LOG_INTERVAL] +[default0]: [--exit-interval EXIT_INTERVAL] +[default0]: [--exit-duration-in-mins EXIT_DURATION_IN_MINS] +[default0]: [--tensorboard-dir TENSORBOARD_DIR] +[default0]: [--no-masked-softmax-fusion] [--no-bias-gelu-fusion] +[default0]: [--no-bias-dropout-fusion] [--optimizer {adam,sgd}] +[default0]: [--use-bnb-optimizer] +[default0]: [--dataloader-type {single,cyclic}] [--cpu-optimizer] +[default0]: [--cpu_torch_adam] [--codecarbon-dir CODECARBON_DIR] +[default0]: [--eval-only EVAL_ONLY] +[default0]: [--skip-train-iteration-range SKIP_TRAIN_ITERATION_RANGE [SKIP_TRAIN_ITERATION_RANGE ...]] +[default0]: [--inference] +[default0]: [--abort-on-unmet-fused-kernel-constraints] +[default0]: [--pp-partition-method PP_PARTITION_METHOD] +[default0]: [--seed SEED] [--init-method-std INIT_METHOD_STD] +[default0]: [--init-method-xavier-uniform] [--lr LR] +[default0]: [--lr-decay-style {constant,linear,cosine}] +[default0]: [--lr-decay-iters LR_DECAY_ITERS] +[default0]: [--lr-decay-samples LR_DECAY_SAMPLES] +[default0]: [--lr-decay-tokens LR_DECAY_TOKENS] +[default0]: [--lr-warmup-fraction LR_WARMUP_FRACTION] +[default0]: [--lr-warmup-iters LR_WARMUP_ITERS] +[default0]: [--lr-warmup-samples LR_WARMUP_SAMPLES] +[default0]: [--warmup WARMUP] [--min-lr MIN_LR] +[default0]: [--override-lr-scheduler] +[default0]: [--use-checkpoint-lr-scheduler] [--save SAVE] +[default0]: [--save-interval SAVE_INTERVAL] [--no-save-optim] +[default0]: [--no-save-rng] [--load LOAD] [--no-load-optim] +[default0]: [--no-load-rng] [--finetune] [--fp16] [--bf16] +[default1]:usage: finetune_t0.py [-h] [--num-layers NUM_LAYERS] +[default1]: [--hidden-size HIDDEN_SIZE] +[default1]: [--ffn-hidden-size FFN_HIDDEN_SIZE] +[default1]: [--num-attention-heads NUM_ATTENTION_HEADS] +[default1]: [--kv-channels KV_CHANNELS] +[default0]: [--loss-scale LOSS_SCALE] +[default0]: [--initial-loss-scale INITIAL_LOSS_SCALE] +[default0]: [--min-loss-scale MIN_LOSS_SCALE] +[default1]: [--max-position-embeddings MAX_POSITION_EMBEDDINGS] +[default1]: [--make-vocab-size-divisible-by MAKE_VOCAB_SIZE_DIVISIBLE_BY] +[default1]: [--pad-vocab-size-to PAD_VOCAB_SIZE_TO] +[default0]: [--loss-scale-window LOSS_SCALE_WINDOW] +[default0]: [--hysteresis HYSTERESIS] [--fp32-residual-connection] +[default0]: [--no-query-key-layer-scaling] +[default1]: [--layernorm-epsilon LAYERNORM_EPSILON] +[default1]: [--sync-tp-duplicated-parameters] +[default1]: [--apply-residual-connection-post-layernorm] +[default0]: [--attention-softmax-in-fp32] +[default0]: [--accumulate-allreduce-grads-in-fp32] +[default0]: [--fp16-lm-cross-entropy] +[default0]: [--tensor-model-parallel-size TENSOR_MODEL_PARALLEL_SIZE] +[default1]: [--embed-layernorm] [--openai-gelu] +[default1]: [--onnx-safe ONNX_SAFE] [--bert-no-binary-head] +[default1]: [--position-embedding-type {PositionEmbeddingType.rotary,PositionEmbeddingType.absolute,PositionEmbeddingType.alibi}] +[default0]: [--pipeline-model-parallel-size PIPELINE_MODEL_PARALLEL_SIZE] +[default0]: [--model-parallel-size MODEL_PARALLEL_SIZE] +[default0]: [--num-layers-per-virtual-pipeline-stage NUM_LAYERS_PER_VIRTUAL_PIPELINE_STAGE] +[default0]: [--distributed-backend {nccl,gloo}] +[default1]: [--glu-activation {geglu,liglu,reglu,swiglu}] +[default1]: [--kill-switch-path KILL_SWITCH_PATH] +[default1]: [--log-level {debug,info,warning,error,critical}] +[default1]: [--log-level-replica {debug,info,warning,error,critical}] +[default0]: [--DDP-impl {local,torch}] +[default0]: [--use-contiguous-buffers-in-ddp] +[default0]: [--no-scatter-gather-tensors-in-pipeline] +[default1]: [--attention-dropout ATTENTION_DROPOUT] +[default1]: [--hidden-dropout HIDDEN_DROPOUT] +[default1]: [--weight-decay WEIGHT_DECAY] [--clip-grad CLIP_GRAD] +[default1]: [--adam-beta1 ADAM_BETA1] [--adam-beta2 ADAM_BETA2] +[default0]: [--local_rank LOCAL_RANK] +[default0]: [--lazy-mpu-init LAZY_MPU_INIT] +[default0]: [--use-cpu-initialization] [--eval-iters EVAL_ITERS] +[default1]: [--adam-eps ADAM_EPS] [--sgd-momentum SGD_MOMENTUM] +[default1]: [--micro-batch-size MICRO_BATCH_SIZE] +[default1]: [--batch-size BATCH_SIZE] +[default1]: [--global-batch-size GLOBAL_BATCH_SIZE] +[default1]: [--rampup-batch-size [RAMPUP_BATCH_SIZE [RAMPUP_BATCH_SIZE ...]]] +[default1]: [--checkpoint-activations] +[default1]: [--distribute-checkpointed-activations] +[default1]: [--checkpoint-num-layers CHECKPOINT_NUM_LAYERS] +[default0]: [--eval-interval EVAL_INTERVAL] +[default0]: [--data-path [DATA_PATH [DATA_PATH ...]]] +[default0]: [--split SPLIT] +[default1]: [--train-iters TRAIN_ITERS] +[default1]: [--train-samples TRAIN_SAMPLES] +[default1]: [--train-tokens TRAIN_TOKENS] +[default1]: [--log-interval LOG_INTERVAL] +[default1]: [--exit-interval EXIT_INTERVAL] +[default0]: [--train-weighted-split-paths [TRAIN_WEIGHTED_SPLIT_PATHS [TRAIN_WEIGHTED_SPLIT_PATHS ...]]] +[default2]:usage: finetune_t0.py [-h] [--num-layers NUM_LAYERS] +[default2]: [--hidden-size HIDDEN_SIZE] +[default2]: [--ffn-hidden-size FFN_HIDDEN_SIZE] +[default2]: [--num-attention-heads NUM_ATTENTION_HEADS] +[default0]: [--valid-weighted-split-paths [VALID_WEIGHTED_SPLIT_PATHS [VALID_WEIGHTED_SPLIT_PATHS ...]]] +[default0]: [--test-weighted-split-paths [TEST_WEIGHTED_SPLIT_PATHS [TEST_WEIGHTED_SPLIT_PATHS ...]]] +[default0]: [--train-weighted-split-paths-path TRAIN_WEIGHTED_SPLIT_PATHS_PATH] +[default1]: [--exit-duration-in-mins EXIT_DURATION_IN_MINS] +[default1]: [--tensorboard-dir TENSORBOARD_DIR] +[default1]: [--no-masked-softmax-fusion] [--no-bias-gelu-fusion] +[default1]: [--no-bias-dropout-fusion] [--optimizer {adam,sgd}] +[default0]: [--valid-weighted-split-paths-path VALID_WEIGHTED_SPLIT_PATHS_PATH] +[default0]: [--test-weighted-split-paths-path TEST_WEIGHTED_SPLIT_PATHS_PATH] +[default1]: [--use-bnb-optimizer] +[default1]: [--dataloader-type {single,cyclic}] [--cpu-optimizer] +[default0]: [--log-path LOG_PATH] [--vocab-file VOCAB_FILE] +[default0]: [--merge-file MERGE_FILE] +[default2]: [--kv-channels KV_CHANNELS] +[default2]: [--max-position-embeddings MAX_POSITION_EMBEDDINGS] +[default2]: [--make-vocab-size-divisible-by MAKE_VOCAB_SIZE_DIVISIBLE_BY] +[default2]: [--pad-vocab-size-to PAD_VOCAB_SIZE_TO] +[default2]: [--layernorm-epsilon LAYERNORM_EPSILON] +[default0]: [--vocab-extra-ids VOCAB_EXTRA_IDS] +[default0]: [--seq-length SEQ_LENGTH] +[default0]: [--encoder-seq-length ENCODER_SEQ_LENGTH] +[default0]: [--decoder-seq-length DECODER_SEQ_LENGTH] +[default0]: [--retriever-seq-length RETRIEVER_SEQ_LENGTH] +[default0]: [--sample-rate SAMPLE_RATE] [--mask-prob MASK_PROB] +[default0]: [--short-seq-prob SHORT_SEQ_PROB] [--mmap-warmup] +[default0]: [--num-workers NUM_WORKERS] +[default3]:usage: finetune_t0.py [-h] [--num-layers NUM_LAYERS] +[default3]: [--hidden-size HIDDEN_SIZE] +[default3]: [--ffn-hidden-size FFN_HIDDEN_SIZE] +[default0]: [--valid-num-workers VALID_NUM_WORKERS] +[default0]: [--tokenizer-type {BertWordPieceLowerCase,BertWordPieceCase,GPT2BPETokenizer,PretrainedFromHF}] +[default3]: [--num-attention-heads NUM_ATTENTION_HEADS] +[default3]: [--kv-channels KV_CHANNELS] +[default3]: [--max-position-embeddings MAX_POSITION_EMBEDDINGS] +[default3]: [--make-vocab-size-divisible-by MAKE_VOCAB_SIZE_DIVISIBLE_BY] +[default3]: [--pad-vocab-size-to PAD_VOCAB_SIZE_TO] +[default3]: [--layernorm-epsilon LAYERNORM_EPSILON] +[default2]: [--sync-tp-duplicated-parameters] +[default2]: [--apply-residual-connection-post-layernorm] +[default2]: [--embed-layernorm] [--openai-gelu] +[default3]: [--sync-tp-duplicated-parameters] +[default0]: [--tokenizer-name-or-path TOKENIZER_NAME_OR_PATH] +[default0]: [--data-impl {lazy,cached,mmap,infer}] +[default0]: [--reset-position-ids] [--reset-attention-mask] +[default3]: [--apply-residual-connection-post-layernorm] +[default1]: [--cpu_torch_adam] [--codecarbon-dir CODECARBON_DIR] +[default1]: [--eval-only EVAL_ONLY] +[default3]: [--embed-layernorm] [--openai-gelu] +[default2]: [--onnx-safe ONNX_SAFE] [--bert-no-binary-head] +[default2]: [--position-embedding-type {PositionEmbeddingType.rotary,PositionEmbeddingType.absolute,PositionEmbeddingType.alibi}] +[default3]: [--onnx-safe ONNX_SAFE] [--bert-no-binary-head] +[default3]: [--position-embedding-type {PositionEmbeddingType.rotary,PositionEmbeddingType.absolute,PositionEmbeddingType.alibi}] +[default3]: [--glu-activation {geglu,liglu,reglu,swiglu}] +[default3]: [--kill-switch-path KILL_SWITCH_PATH] +[default3]: [--log-level {debug,info,warning,error,critical}] +[default2]: [--glu-activation {geglu,liglu,reglu,swiglu}] +[default2]: [--kill-switch-path KILL_SWITCH_PATH] +[default1]: [--skip-train-iteration-range SKIP_TRAIN_ITERATION_RANGE [SKIP_TRAIN_ITERATION_RANGE ...]] +[default2]: [--log-level {debug,info,warning,error,critical}] +[default2]: [--log-level-replica {debug,info,warning,error,critical}] +[default0]: [--eod-mask-loss] [--loss-on-targets-only] +[default3]: [--log-level-replica {debug,info,warning,error,critical}] +[default3]: [--attention-dropout ATTENTION_DROPOUT] +[default0]: [--norm-target-loss] +[default0]: [--reweight-loss-based-on-position-frequency] +[default0]: [--noise-density NOISE_DENSITY] +[default2]: [--attention-dropout ATTENTION_DROPOUT] +[default0]:usage: finetune_t0.py [-h] [--num-layers NUM_LAYERS] +[default1]:usage: finetune_t0.py [-h] [--num-layers NUM_LAYERS] +[default0]: [--hidden-size HIDDEN_SIZE] +[default0]: [--ffn-hidden-size FFN_HIDDEN_SIZE] +[default0]: [--num-attention-heads NUM_ATTENTION_HEADS] +[default0]: [--kv-channels KV_CHANNELS] +[default3]: [--hidden-dropout HIDDEN_DROPOUT] +[default3]: [--weight-decay WEIGHT_DECAY] [--clip-grad CLIP_GRAD] +[default3]: [--adam-beta1 ADAM_BETA1] [--adam-beta2 ADAM_BETA2] +[default0]: [--mean-noise-span-length MEAN_NOISE_SPAN_LENGTH] +[default3]: [--adam-eps ADAM_EPS] [--sgd-momentum SGD_MOMENTUM] +[default2]: [--hidden-dropout HIDDEN_DROPOUT] +[default1]: [--hidden-size HIDDEN_SIZE] +[default1]: [--ffn-hidden-size FFN_HIDDEN_SIZE] +[default0]: [--max-position-embeddings MAX_POSITION_EMBEDDINGS] +[default0]: [--make-vocab-size-divisible-by MAKE_VOCAB_SIZE_DIVISIBLE_BY] +[default0]: [--pad-vocab-size-to PAD_VOCAB_SIZE_TO] +[default1]: [--num-attention-heads NUM_ATTENTION_HEADS] +[default1]: [--kv-channels KV_CHANNELS] +[default0]: [--layernorm-epsilon LAYERNORM_EPSILON] +[default0]: [--sync-tp-duplicated-parameters] +[default0]: [--apply-residual-connection-post-layernorm] +[default1]: [--max-position-embeddings MAX_POSITION_EMBEDDINGS] +[default1]: [--make-vocab-size-divisible-by MAKE_VOCAB_SIZE_DIVISIBLE_BY] +[default0]: [--embed-layernorm] [--openai-gelu] +[default2]: [--weight-decay WEIGHT_DECAY] [--clip-grad CLIP_GRAD] +[default3]: [--micro-batch-size MICRO_BATCH_SIZE] +[default1]: [--inference] +[default2]: [--adam-beta1 ADAM_BETA1] [--adam-beta2 ADAM_BETA2] +[default0]: [--prefixlm] [--adlr-autoresume] +[default0]: [--adlr-autoresume-interval ADLR_AUTORESUME_INTERVAL] +[default0]: [--ict-head-size ICT_HEAD_SIZE] +[default2]: [--adam-eps ADAM_EPS] [--sgd-momentum SGD_MOMENTUM] +[default2]: [--micro-batch-size MICRO_BATCH_SIZE] +[default0]: [--onnx-safe ONNX_SAFE] [--bert-no-binary-head] +[default0]: [--position-embedding-type {PositionEmbeddingType.rotary,PositionEmbeddingType.absolute,PositionEmbeddingType.alibi}] +[default1]: [--pad-vocab-size-to PAD_VOCAB_SIZE_TO] +[default3]: [--batch-size BATCH_SIZE] +[default3]: [--global-batch-size GLOBAL_BATCH_SIZE] +[default0]: [--biencoder-projection-dim BIENCODER_PROJECTION_DIM] +[default0]: [--biencoder-shared-query-context-model] +[default0]: [--ict-load ICT_LOAD] [--bert-load BERT_LOAD] +[default1]: [--layernorm-epsilon LAYERNORM_EPSILON] +[default0]: [--glu-activation {geglu,liglu,reglu,swiglu}] +[default0]: [--kill-switch-path KILL_SWITCH_PATH] +[default0]: [--log-level {debug,info,warning,error,critical}] +[default0]: [--log-level-replica {debug,info,warning,error,critical}] +[default1]: [--sync-tp-duplicated-parameters] +[default1]: [--apply-residual-connection-post-layernorm] +[default1]: [--embed-layernorm] [--openai-gelu] +[default0]: [--attention-dropout ATTENTION_DROPOUT] +[default0]: [--hidden-dropout HIDDEN_DROPOUT] +[default0]: [--weight-decay WEIGHT_DECAY] [--clip-grad CLIP_GRAD] +[default1]: [--onnx-safe ONNX_SAFE] [--bert-no-binary-head] +[default0]: [--titles-data-path TITLES_DATA_PATH] +[default2]: [--batch-size BATCH_SIZE] +[default1]: [--position-embedding-type {PositionEmbeddingType.rotary,PositionEmbeddingType.absolute,PositionEmbeddingType.alibi}] +[default1]: [--glu-activation {geglu,liglu,reglu,swiglu}] +[default0]: [--adam-beta1 ADAM_BETA1] [--adam-beta2 ADAM_BETA2] +[default0]: [--adam-eps ADAM_EPS] [--sgd-momentum SGD_MOMENTUM] +[default0]: [--micro-batch-size MICRO_BATCH_SIZE] +[default0]: [--batch-size BATCH_SIZE] +[default0]: [--global-batch-size GLOBAL_BATCH_SIZE] +[default0]: [--rampup-batch-size [RAMPUP_BATCH_SIZE [RAMPUP_BATCH_SIZE ...]]] +[default1]: [--kill-switch-path KILL_SWITCH_PATH] +[default1]: [--log-level {debug,info,warning,error,critical}] +[default0]: [--checkpoint-activations] +[default0]: [--distribute-checkpointed-activations] +[default0]: [--checkpoint-num-layers CHECKPOINT_NUM_LAYERS] +[default0]: [--train-iters TRAIN_ITERS] +[default1]: [--log-level-replica {debug,info,warning,error,critical}] +[default1]: [--attention-dropout ATTENTION_DROPOUT] +[default0]: [--train-samples TRAIN_SAMPLES] +[default1]: [--abort-on-unmet-fused-kernel-constraints] +[default1]: [--hidden-dropout HIDDEN_DROPOUT] +[default0]: [--train-tokens TRAIN_TOKENS] +[default1]: [--weight-decay WEIGHT_DECAY] [--clip-grad CLIP_GRAD] +[default1]: [--adam-beta1 ADAM_BETA1] [--adam-beta2 ADAM_BETA2] +[default0]: [--log-interval LOG_INTERVAL] +[default0]: [--exit-interval EXIT_INTERVAL] +[default0]: [--exit-duration-in-mins EXIT_DURATION_IN_MINS] +[default1]: [--adam-eps ADAM_EPS] [--sgd-momentum SGD_MOMENTUM] +[default1]: [--micro-batch-size MICRO_BATCH_SIZE] +[default1]: [--batch-size BATCH_SIZE] +[default0]: [--tensorboard-dir TENSORBOARD_DIR] +[default0]: [--no-masked-softmax-fusion] [--no-bias-gelu-fusion] +[default0]: [--no-bias-dropout-fusion] [--optimizer {adam,sgd}] +[default1]: [--pp-partition-method PP_PARTITION_METHOD] +[default1]: [--seed SEED] [--init-method-std INIT_METHOD_STD] +[default1]: [--init-method-xavier-uniform] [--lr LR] +[default2]: [--global-batch-size GLOBAL_BATCH_SIZE] +[default1]: [--lr-decay-style {constant,linear,cosine}] +[default1]: [--global-batch-size GLOBAL_BATCH_SIZE] +[default1]: [--rampup-batch-size [RAMPUP_BATCH_SIZE [RAMPUP_BATCH_SIZE ...]]] +[default1]: [--checkpoint-activations] +[default1]: [--distribute-checkpointed-activations] +[default1]: [--lr-decay-iters LR_DECAY_ITERS] +[default0]: [--query-in-block-prob QUERY_IN_BLOCK_PROB] +[default0]: [--use-bnb-optimizer] +[default0]: [--dataloader-type {single,cyclic}] [--cpu-optimizer] +[default0]: [--cpu_torch_adam] [--codecarbon-dir CODECARBON_DIR] +[default0]: [--eval-only EVAL_ONLY] +[default1]: [--checkpoint-num-layers CHECKPOINT_NUM_LAYERS] +[default1]: [--train-iters TRAIN_ITERS] +[default0]: [--skip-train-iteration-range SKIP_TRAIN_ITERATION_RANGE [SKIP_TRAIN_ITERATION_RANGE ...]] +[default1]: [--lr-decay-samples LR_DECAY_SAMPLES] +[default1]: [--lr-decay-tokens LR_DECAY_TOKENS] +[default0]: [--use-one-sent-docs] +[default0]: [--evidence-data-path EVIDENCE_DATA_PATH] +[default0]: [--inference] +[default1]: [--lr-warmup-fraction LR_WARMUP_FRACTION] +[default1]: [--lr-warmup-iters LR_WARMUP_ITERS] +[default3]: [--rampup-batch-size [RAMPUP_BATCH_SIZE [RAMPUP_BATCH_SIZE ...]]] +[default3]: [--checkpoint-activations] +[default1]: [--train-samples TRAIN_SAMPLES] +[default1]: [--train-tokens TRAIN_TOKENS] +[default3]: [--distribute-checkpointed-activations] +[default3]: [--checkpoint-num-layers CHECKPOINT_NUM_LAYERS] +[default1]: [--log-interval LOG_INTERVAL] +[default1]: [--exit-interval EXIT_INTERVAL] +[default0]: [--retriever-report-topk-accuracies RETRIEVER_REPORT_TOPK_ACCURACIES [RETRIEVER_REPORT_TOPK_ACCURACIES ...]] +[default0]: [--abort-on-unmet-fused-kernel-constraints] +[default0]: [--pp-partition-method PP_PARTITION_METHOD] +[default1]: [--exit-duration-in-mins EXIT_DURATION_IN_MINS] +[default2]: [--rampup-batch-size [RAMPUP_BATCH_SIZE [RAMPUP_BATCH_SIZE ...]]] +[default2]: [--checkpoint-activations] +[default0]: [--seed SEED] [--init-method-std INIT_METHOD_STD] +[default0]: [--init-method-xavier-uniform] [--lr LR] +[default0]: [--lr-decay-style {constant,linear,cosine}] +[default2]: [--distribute-checkpointed-activations] +[default2]: [--checkpoint-num-layers CHECKPOINT_NUM_LAYERS] +[default2]: [--train-iters TRAIN_ITERS] +[default0]: [--retriever-score-scaling] +[default2]: [--train-samples TRAIN_SAMPLES] +[default2]: [--train-tokens TRAIN_TOKENS] +[default1]: [--tensorboard-dir TENSORBOARD_DIR] +[default1]: [--no-masked-softmax-fusion] [--no-bias-gelu-fusion] +[default1]: [--no-bias-dropout-fusion] [--optimizer {adam,sgd}] +[default1]: [--use-bnb-optimizer] +[default1]: [--dataloader-type {single,cyclic}] [--cpu-optimizer] +[default1]: [--cpu_torch_adam] [--codecarbon-dir CODECARBON_DIR] +[default3]: [--train-iters TRAIN_ITERS] +[default1]: [--eval-only EVAL_ONLY] +[default1]: [--skip-train-iteration-range SKIP_TRAIN_ITERATION_RANGE [SKIP_TRAIN_ITERATION_RANGE ...]] +[default1]: [--inference] +[default1]: [--abort-on-unmet-fused-kernel-constraints] +[default1]: [--pp-partition-method PP_PARTITION_METHOD] +[default1]: [--seed SEED] [--init-method-std INIT_METHOD_STD] +[default1]: [--init-method-xavier-uniform] [--lr LR] +[default1]: [--lr-decay-style {constant,linear,cosine}] +[default1]: [--lr-decay-iters LR_DECAY_ITERS] +[default1]: [--lr-decay-samples LR_DECAY_SAMPLES] +[default1]: [--lr-decay-tokens LR_DECAY_TOKENS] +[default1]: [--lr-warmup-fraction LR_WARMUP_FRACTION] +[default1]: [--lr-warmup-iters LR_WARMUP_ITERS] +[default3]: [--train-samples TRAIN_SAMPLES] +[default2]: [--log-interval LOG_INTERVAL] +[default2]: [--exit-interval EXIT_INTERVAL] +[default2]: [--exit-duration-in-mins EXIT_DURATION_IN_MINS] +[default1]: [--lr-warmup-samples LR_WARMUP_SAMPLES] +[default1]: [--warmup WARMUP] [--min-lr MIN_LR] +[default1]: [--override-lr-scheduler] +[default1]: [--use-checkpoint-lr-scheduler] [--save SAVE] +[default1]: [--save-interval SAVE_INTERVAL] [--no-save-optim] +[default1]: [--no-save-rng] [--load LOAD] [--no-load-optim] +[default1]: [--no-load-rng] [--finetune] [--fp16] [--bf16] +[default1]: [--loss-scale LOSS_SCALE] +[default1]: [--initial-loss-scale INITIAL_LOSS_SCALE] +[default1]: [--min-loss-scale MIN_LOSS_SCALE] +[default1]: [--loss-scale-window LOSS_SCALE_WINDOW] +[default1]: [--hysteresis HYSTERESIS] [--fp32-residual-connection] +[default1]: [--no-query-key-layer-scaling] +[default1]: [--attention-softmax-in-fp32] +[default1]: [--accumulate-allreduce-grads-in-fp32] +[default1]: [--fp16-lm-cross-entropy] +[default1]: [--tensor-model-parallel-size TENSOR_MODEL_PARALLEL_SIZE] +[default1]: [--pipeline-model-parallel-size PIPELINE_MODEL_PARALLEL_SIZE] +[default1]: [--model-parallel-size MODEL_PARALLEL_SIZE] +[default1]: [--num-layers-per-virtual-pipeline-stage NUM_LAYERS_PER_VIRTUAL_PIPELINE_STAGE] +[default1]: [--distributed-backend {nccl,gloo}] +[default1]: [--DDP-impl {local,torch}] +[default1]: [--use-contiguous-buffers-in-ddp] +[default1]: [--no-scatter-gather-tensors-in-pipeline] +[default1]: [--local_rank LOCAL_RANK] +[default1]: [--lazy-mpu-init LAZY_MPU_INIT] +[default1]: [--use-cpu-initialization] [--eval-iters EVAL_ITERS] +[default1]: [--eval-interval EVAL_INTERVAL] +[default1]: [--data-path [DATA_PATH [DATA_PATH ...]]] +[default2]: [--tensorboard-dir TENSORBOARD_DIR] +[default2]: [--no-masked-softmax-fusion] [--no-bias-gelu-fusion] +[default2]: [--no-bias-dropout-fusion] [--optimizer {adam,sgd}] +[default2]: [--use-bnb-optimizer] +[default1]: [--split SPLIT] +[default1]: [--train-weighted-split-paths [TRAIN_WEIGHTED_SPLIT_PATHS [TRAIN_WEIGHTED_SPLIT_PATHS ...]]] +[default1]: [--valid-weighted-split-paths [VALID_WEIGHTED_SPLIT_PATHS [VALID_WEIGHTED_SPLIT_PATHS ...]]] +[default3]: [--train-tokens TRAIN_TOKENS] +[default3]: [--log-interval LOG_INTERVAL] +[default1]: [--test-weighted-split-paths [TEST_WEIGHTED_SPLIT_PATHS [TEST_WEIGHTED_SPLIT_PATHS ...]]] +[default1]: [--train-weighted-split-paths-path TRAIN_WEIGHTED_SPLIT_PATHS_PATH] +[default1]: [--valid-weighted-split-paths-path VALID_WEIGHTED_SPLIT_PATHS_PATH] +[default1]: [--test-weighted-split-paths-path TEST_WEIGHTED_SPLIT_PATHS_PATH] +[default1]: [--log-path LOG_PATH] [--vocab-file VOCAB_FILE] +[default1]: [--merge-file MERGE_FILE] +[default1]: [--vocab-extra-ids VOCAB_EXTRA_IDS] +[default1]: [--seq-length SEQ_LENGTH] +[default1]: [--encoder-seq-length ENCODER_SEQ_LENGTH] +[default1]: [--decoder-seq-length DECODER_SEQ_LENGTH] +[default1]: [--retriever-seq-length RETRIEVER_SEQ_LENGTH] +[default1]: [--sample-rate SAMPLE_RATE] [--mask-prob MASK_PROB] +[default2]: [--dataloader-type {single,cyclic}] [--cpu-optimizer] +[default2]: [--cpu_torch_adam] [--codecarbon-dir CODECARBON_DIR] +[default1]: [--short-seq-prob SHORT_SEQ_PROB] [--mmap-warmup] +[default1]: [--num-workers NUM_WORKERS] +[default1]: [--valid-num-workers VALID_NUM_WORKERS] +[default1]: [--tokenizer-type {BertWordPieceLowerCase,BertWordPieceCase,GPT2BPETokenizer,PretrainedFromHF}] +[default1]: [--lr-warmup-samples LR_WARMUP_SAMPLES] +[default1]: [--tokenizer-name-or-path TOKENIZER_NAME_OR_PATH] +[default1]: [--data-impl {lazy,cached,mmap,infer}] +[default1]: [--reset-position-ids] [--reset-attention-mask] +[default1]: [--eod-mask-loss] [--loss-on-targets-only] +[default1]: [--norm-target-loss] +[default1]: [--reweight-loss-based-on-position-frequency] +[default1]: [--noise-density NOISE_DENSITY] +[default1]: [--mean-noise-span-length MEAN_NOISE_SPAN_LENGTH] +[default1]: [--prefixlm] [--adlr-autoresume] +[default1]: [--adlr-autoresume-interval ADLR_AUTORESUME_INTERVAL] +[default1]: [--ict-head-size ICT_HEAD_SIZE] +[default1]: [--biencoder-projection-dim BIENCODER_PROJECTION_DIM] +[default1]: [--biencoder-shared-query-context-model] +[default1]: [--warmup WARMUP] [--min-lr MIN_LR] +[default1]: [--ict-load ICT_LOAD] [--bert-load BERT_LOAD] +[default1]: [--titles-data-path TITLES_DATA_PATH] +[default1]: [--query-in-block-prob QUERY_IN_BLOCK_PROB] +[default1]: [--use-one-sent-docs] +[default1]: [--evidence-data-path EVIDENCE_DATA_PATH] +[default1]: [--retriever-report-topk-accuracies RETRIEVER_REPORT_TOPK_ACCURACIES [RETRIEVER_REPORT_TOPK_ACCURACIES ...]] +[default1]: [--retriever-score-scaling] +[default0]: [--lr-decay-iters LR_DECAY_ITERS] +[default0]: [--lr-decay-samples LR_DECAY_SAMPLES] +[default0]: [--lr-decay-tokens LR_DECAY_TOKENS] +[default0]: [--lr-warmup-fraction LR_WARMUP_FRACTION] +[default0]: [--lr-warmup-iters LR_WARMUP_ITERS] +[default1]: [--block-data-path BLOCK_DATA_PATH] +[default1]: [--embedding-path EMBEDDING_PATH] +[default1]: [--indexer-batch-size INDEXER_BATCH_SIZE] +[default1]: [--indexer-log-interval INDEXER_LOG_INTERVAL] +[default0]: [--lr-warmup-samples LR_WARMUP_SAMPLES] +[default0]: [--warmup WARMUP] [--min-lr MIN_LR] +[default1]: [--num-classes NUM_CLASSES] [--img-dim IMG_DIM] +[default0]: [--override-lr-scheduler] +[default0]: [--use-checkpoint-lr-scheduler] [--save SAVE] +[default0]: [--save-interval SAVE_INTERVAL] [--no-save-optim] +[default1]: [--override-lr-scheduler] +[default1]: [--use-checkpoint-lr-scheduler] [--save SAVE] +[default1]: [--save-interval SAVE_INTERVAL] [--no-save-optim] +[default1]: [--no-save-rng] [--load LOAD] [--no-load-optim] +[default2]:usage: finetune_t0.py [-h] [--num-layers NUM_LAYERS] +[default2]: [--eval-only EVAL_ONLY] +[default2]: [--skip-train-iteration-range SKIP_TRAIN_ITERATION_RANGE [SKIP_TRAIN_ITERATION_RANGE ...]] +[default1]: [--no-load-rng] [--finetune] [--fp16] [--bf16] +[default2]: [--inference] +[default2]: [--hidden-size HIDDEN_SIZE] +[default2]: [--ffn-hidden-size FFN_HIDDEN_SIZE] +[default2]: [--num-attention-heads NUM_ATTENTION_HEADS] +[default3]: [--exit-interval EXIT_INTERVAL] +[default2]: [--abort-on-unmet-fused-kernel-constraints] +[default0]: [--block-data-path BLOCK_DATA_PATH] +[default0]: [--embedding-path EMBEDDING_PATH] +[default3]: [--exit-duration-in-mins EXIT_DURATION_IN_MINS] +[default0]: [--no-save-rng] [--load LOAD] [--no-load-optim] +[default0]: [--no-load-rng] [--finetune] [--fp16] [--bf16] +[default2]: [--kv-channels KV_CHANNELS] +[default3]: [--tensorboard-dir TENSORBOARD_DIR] +[default0]: [--indexer-batch-size INDEXER_BATCH_SIZE] +[default1]: [--loss-scale LOSS_SCALE] +[default1]: [--initial-loss-scale INITIAL_LOSS_SCALE] +[default1]: [--min-loss-scale MIN_LOSS_SCALE] +[default0]: [--indexer-log-interval INDEXER_LOG_INTERVAL] +[default1]: [--loss-scale-window LOSS_SCALE_WINDOW] +[default1]: [--num-channels NUM_CHANNELS] [--patch-dim PATCH_DIM] +[default2]: [--max-position-embeddings MAX_POSITION_EMBEDDINGS] +[default3]: [--no-masked-softmax-fusion] [--no-bias-gelu-fusion] +[default1]: [--hysteresis HYSTERESIS] [--fp32-residual-connection] +[default1]: [--no-query-key-layer-scaling] +[default1]: [--attention-softmax-in-fp32] +[default2]: [--make-vocab-size-divisible-by MAKE_VOCAB_SIZE_DIVISIBLE_BY] +[default2]: [--pad-vocab-size-to PAD_VOCAB_SIZE_TO] +[default0]: [--loss-scale LOSS_SCALE] +[default1]: [--log-params-norm] [--log-num-zeros-in-grad] +[default1]: [--tensorboard-log-interval TENSORBOARD_LOG_INTERVAL] +[default0]: [--num-classes NUM_CLASSES] [--img-dim IMG_DIM] +[default0]: [--num-channels NUM_CHANNELS] [--patch-dim PATCH_DIM] +[default1]: [--accumulate-allreduce-grads-in-fp32] +[default2]: [--layernorm-epsilon LAYERNORM_EPSILON] +[default2]: [--sync-tp-duplicated-parameters] +[default1]: [--tensorboard-queue-size TENSORBOARD_QUEUE_SIZE] +[default1]: [--log-timers-to-tensorboard] +[default2]: [--apply-residual-connection-post-layernorm] +[default0]: [--initial-loss-scale INITIAL_LOSS_SCALE] +[default0]: [--min-loss-scale MIN_LOSS_SCALE] +[default2]: [--embed-layernorm] [--openai-gelu] +[default2]: [--onnx-safe ONNX_SAFE] [--bert-no-binary-head] +[default0]: [--loss-scale-window LOSS_SCALE_WINDOW] +[default1]: [--fp16-lm-cross-entropy] +[default0]: [--log-params-norm] [--log-num-zeros-in-grad] +[default1]: [--tensor-model-parallel-size TENSOR_MODEL_PARALLEL_SIZE] +[default1]: [--pipeline-model-parallel-size PIPELINE_MODEL_PARALLEL_SIZE] +[default1]: [--model-parallel-size MODEL_PARALLEL_SIZE] +[default2]: [--pp-partition-method PP_PARTITION_METHOD] +[default2]: [--seed SEED] [--init-method-std INIT_METHOD_STD] +[default1]: [--num-layers-per-virtual-pipeline-stage NUM_LAYERS_PER_VIRTUAL_PIPELINE_STAGE] +[default1]: [--distributed-backend {nccl,gloo}] +[default0]: [--hysteresis HYSTERESIS] [--fp32-residual-connection] +[default0]: [--no-query-key-layer-scaling] +[default2]: [--position-embedding-type {PositionEmbeddingType.rotary,PositionEmbeddingType.absolute,PositionEmbeddingType.alibi}] +[default2]: [--glu-activation {geglu,liglu,reglu,swiglu}] +[default2]: [--kill-switch-path KILL_SWITCH_PATH] +[default1]: [--log-batch-size-to-tensorboard] +[default1]: [--no-log-learnig-rate-to-tensorboard] +[default1]: [--no-log-loss-scale-to-tensorboard] +[default1]: [--DDP-impl {local,torch}] +[default1]: [--use-contiguous-buffers-in-ddp] +[default0]: [--attention-softmax-in-fp32] +[default0]: [--accumulate-allreduce-grads-in-fp32] +[default2]: [--log-level {debug,info,warning,error,critical}] +[default1]: [--log-validation-ppl-to-tensorboard] +[default1]: [--zero-stage ZERO_STAGE] [--zero-reduce-scatter] +[default1]: [--no-scatter-gather-tensors-in-pipeline] +[default3]: [--no-bias-dropout-fusion] [--optimizer {adam,sgd}] +[default3]: [--use-bnb-optimizer] +[default3]: [--dataloader-type {single,cyclic}] [--cpu-optimizer] +[default0]: [--fp16-lm-cross-entropy] +[default1]: [--zero-contigious-gradients] +[default2]: [--log-level-replica {debug,info,warning,error,critical}] +[default0]: [--tensorboard-log-interval TENSORBOARD_LOG_INTERVAL] +[default0]: [--tensorboard-queue-size TENSORBOARD_QUEUE_SIZE] +[default2]: [--attention-dropout ATTENTION_DROPOUT] +[default2]: [--hidden-dropout HIDDEN_DROPOUT] +[default1]: [--zero-reduce-bucket-size ZERO_REDUCE_BUCKET_SIZE] +[default1]: [--zero-allgather-bucket-size ZERO_ALLGATHER_BUCKET_SIZE] +[default0]: [--tensor-model-parallel-size TENSOR_MODEL_PARALLEL_SIZE] +[default1]: [--remote-device {none,cpu,nvme}] [--use-pin-memory] +[default1]: [--scattered-embeddings] [--split-transformers] +[default2]: [--weight-decay WEIGHT_DECAY] [--clip-grad CLIP_GRAD] +[default2]: [--adam-beta1 ADAM_BETA1] [--adam-beta2 ADAM_BETA2] +[default1]: [--memory-centric-tiled-linear] +[default0]: [--log-timers-to-tensorboard] +[default2]: [--init-method-xavier-uniform] [--lr LR] +[default3]: [--cpu_torch_adam] [--codecarbon-dir CODECARBON_DIR] +[default3]: [--eval-only EVAL_ONLY] +[default3]: [--skip-train-iteration-range SKIP_TRAIN_ITERATION_RANGE [SKIP_TRAIN_ITERATION_RANGE ...]] +[default3]: [--inference] +[default3]: [--abort-on-unmet-fused-kernel-constraints] +[default3]: [--pp-partition-method PP_PARTITION_METHOD] +[default0]: [--log-batch-size-to-tensorboard] +[default1]: [--tile-factor TILE_FACTOR] +[default0]: [--pipeline-model-parallel-size PIPELINE_MODEL_PARALLEL_SIZE] +[default0]: [--model-parallel-size MODEL_PARALLEL_SIZE] +[default1]: [--deepspeed-activation-checkpointing] +[default1]: [--partition-activations] [--contigious-checkpointing] +[default2]: [--adam-eps ADAM_EPS] [--sgd-momentum SGD_MOMENTUM] +[default0]: [--no-log-learnig-rate-to-tensorboard] +[default1]: [--local_rank LOCAL_RANK] +[default2]: [--lr-decay-style {constant,linear,cosine}] +[default2]: [--lr-decay-iters LR_DECAY_ITERS] +[default0]: [--num-layers-per-virtual-pipeline-stage NUM_LAYERS_PER_VIRTUAL_PIPELINE_STAGE] +[default0]: [--distributed-backend {nccl,gloo}] +[default2]: [--micro-batch-size MICRO_BATCH_SIZE] +[default2]: [--batch-size BATCH_SIZE] +[default2]: [--lr-decay-samples LR_DECAY_SAMPLES] +[default2]: [--lr-decay-tokens LR_DECAY_TOKENS] +[default2]: [--global-batch-size GLOBAL_BATCH_SIZE] +[default0]: [--DDP-impl {local,torch}] +[default1]: [--lazy-mpu-init LAZY_MPU_INIT] +[default1]: [--use-cpu-initialization] [--eval-iters EVAL_ITERS] +[default0]: [--use-contiguous-buffers-in-ddp] +[default1]: [--checkpoint-in-cpu] [--synchronize-each-layer] +[default1]: [--profile-backward] [--deepspeed] +[default0]: [--no-scatter-gather-tensors-in-pipeline] +[default0]: [--local_rank LOCAL_RANK] +[default0]: [--no-log-loss-scale-to-tensorboard] +[default0]: [--log-validation-ppl-to-tensorboard] +[default0]: [--zero-stage ZERO_STAGE] [--zero-reduce-scatter] +[default1]: [--eval-interval EVAL_INTERVAL] +[default1]: [--deepspeed_config DEEPSPEED_CONFIG] [--deepscale] +[default1]: [--deepscale_config DEEPSCALE_CONFIG] [--deepspeed_mpi] +[default1]:finetune_t0.py: error: unrecognized arguments: --reset-progress +[default1]: [--data-path [DATA_PATH [DATA_PATH ...]]] +[default2]: [--rampup-batch-size [RAMPUP_BATCH_SIZE [RAMPUP_BATCH_SIZE ...]]] +[default2]: [--checkpoint-activations] +[default2]: [--distribute-checkpointed-activations] +[default2]: [--checkpoint-num-layers CHECKPOINT_NUM_LAYERS] +[default0]: [--zero-contigious-gradients] +[default1]: [--split SPLIT] +[default0]: [--lazy-mpu-init LAZY_MPU_INIT] +[default0]: [--use-cpu-initialization] [--eval-iters EVAL_ITERS] +[default2]: [--train-iters TRAIN_ITERS] +[default2]: [--train-samples TRAIN_SAMPLES] +[default2]: [--train-tokens TRAIN_TOKENS] +[default2]: [--log-interval LOG_INTERVAL] +[default0]: [--eval-interval EVAL_INTERVAL] +[default0]: [--data-path [DATA_PATH [DATA_PATH ...]]] +[default2]: [--exit-interval EXIT_INTERVAL] +[default2]: [--exit-duration-in-mins EXIT_DURATION_IN_MINS] +[default2]: [--tensorboard-dir TENSORBOARD_DIR] +[default2]: [--no-masked-softmax-fusion] [--no-bias-gelu-fusion] +[default0]: [--split SPLIT] +[default1]: [--train-weighted-split-paths [TRAIN_WEIGHTED_SPLIT_PATHS [TRAIN_WEIGHTED_SPLIT_PATHS ...]]] +[default0]: [--train-weighted-split-paths [TRAIN_WEIGHTED_SPLIT_PATHS [TRAIN_WEIGHTED_SPLIT_PATHS ...]]] +[default2]: [--no-bias-dropout-fusion] [--optimizer {adam,sgd}] +[default0]: [--valid-weighted-split-paths [VALID_WEIGHTED_SPLIT_PATHS [VALID_WEIGHTED_SPLIT_PATHS ...]]] +[default3]: [--seed SEED] [--init-method-std INIT_METHOD_STD] +[default3]: [--init-method-xavier-uniform] [--lr LR] +[default3]: [--lr-decay-style {constant,linear,cosine}] +[default3]: [--lr-decay-iters LR_DECAY_ITERS] +[default0]: [--zero-reduce-bucket-size ZERO_REDUCE_BUCKET_SIZE] +[default0]: [--zero-allgather-bucket-size ZERO_ALLGATHER_BUCKET_SIZE] +[default0]: [--remote-device {none,cpu,nvme}] [--use-pin-memory] +[default0]: [--test-weighted-split-paths [TEST_WEIGHTED_SPLIT_PATHS [TEST_WEIGHTED_SPLIT_PATHS ...]]] +[default2]: [--use-bnb-optimizer] +[default0]: [--train-weighted-split-paths-path TRAIN_WEIGHTED_SPLIT_PATHS_PATH] +[default0]: [--valid-weighted-split-paths-path VALID_WEIGHTED_SPLIT_PATHS_PATH] +[default2]: [--dataloader-type {single,cyclic}] [--cpu-optimizer] +[default2]: [--cpu_torch_adam] [--codecarbon-dir CODECARBON_DIR] +[default2]: [--eval-only EVAL_ONLY] +[default2]: [--skip-train-iteration-range SKIP_TRAIN_ITERATION_RANGE [SKIP_TRAIN_ITERATION_RANGE ...]] +[default0]: [--test-weighted-split-paths-path TEST_WEIGHTED_SPLIT_PATHS_PATH] +[default0]: [--log-path LOG_PATH] [--vocab-file VOCAB_FILE] +[default2]: [--inference] +[default2]: [--abort-on-unmet-fused-kernel-constraints] +[default2]: [--pp-partition-method PP_PARTITION_METHOD] +[default2]: [--seed SEED] [--init-method-std INIT_METHOD_STD] +[default0]: [--merge-file MERGE_FILE] +[default0]: [--vocab-extra-ids VOCAB_EXTRA_IDS] +[default2]: [--init-method-xavier-uniform] [--lr LR] +[default3]: [--lr-decay-samples LR_DECAY_SAMPLES] +[default3]: [--lr-decay-tokens LR_DECAY_TOKENS] +[default3]: [--lr-warmup-fraction LR_WARMUP_FRACTION] +[default3]: [--lr-warmup-iters LR_WARMUP_ITERS] +[default1]: [--valid-weighted-split-paths [VALID_WEIGHTED_SPLIT_PATHS [VALID_WEIGHTED_SPLIT_PATHS ...]]] +[default2]: [--lr-decay-style {constant,linear,cosine}] +[default2]: [--lr-decay-iters LR_DECAY_ITERS] +[default2]: [--lr-decay-samples LR_DECAY_SAMPLES] +[default0]: [--seq-length SEQ_LENGTH] +[default0]: [--encoder-seq-length ENCODER_SEQ_LENGTH] +[default2]: [--lr-decay-tokens LR_DECAY_TOKENS] +[default2]: [--lr-warmup-fraction LR_WARMUP_FRACTION] +[default2]: [--lr-warmup-iters LR_WARMUP_ITERS] +[default2]: [--lr-warmup-samples LR_WARMUP_SAMPLES] +[default0]: [--decoder-seq-length DECODER_SEQ_LENGTH] +[default0]: [--retriever-seq-length RETRIEVER_SEQ_LENGTH] +[default2]: [--warmup WARMUP] [--min-lr MIN_LR] +[default0]: [--sample-rate SAMPLE_RATE] [--mask-prob MASK_PROB] +[default0]: [--short-seq-prob SHORT_SEQ_PROB] [--mmap-warmup] +[default2]: [--override-lr-scheduler] +[default2]: [--use-checkpoint-lr-scheduler] [--save SAVE] +[default2]: [--save-interval SAVE_INTERVAL] [--no-save-optim] +[default2]: [--no-save-rng] [--load LOAD] [--no-load-optim] +[default2]: [--no-load-rng] [--finetune] [--fp16] [--bf16] +[default2]: [--loss-scale LOSS_SCALE] +[default2]: [--initial-loss-scale INITIAL_LOSS_SCALE] +[default0]: [--num-workers NUM_WORKERS] +[default0]: [--valid-num-workers VALID_NUM_WORKERS] +[default2]: [--min-loss-scale MIN_LOSS_SCALE] +[default2]: [--loss-scale-window LOSS_SCALE_WINDOW] +[default2]: [--hysteresis HYSTERESIS] [--fp32-residual-connection] +[default2]: [--no-query-key-layer-scaling] +[default0]: [--tokenizer-type {BertWordPieceLowerCase,BertWordPieceCase,GPT2BPETokenizer,PretrainedFromHF}] +[default0]: [--tokenizer-name-or-path TOKENIZER_NAME_OR_PATH] +[default2]: [--attention-softmax-in-fp32] +[default2]: [--accumulate-allreduce-grads-in-fp32] +[default2]: [--fp16-lm-cross-entropy] +[default2]: [--tensor-model-parallel-size TENSOR_MODEL_PARALLEL_SIZE] +[default0]: [--data-impl {lazy,cached,mmap,infer}] +[default0]: [--reset-position-ids] [--reset-attention-mask] +[default0]: [--eod-mask-loss] [--loss-on-targets-only] +[default0]: [--norm-target-loss] +[default0]: [--reweight-loss-based-on-position-frequency] +[default2]: [--pipeline-model-parallel-size PIPELINE_MODEL_PARALLEL_SIZE] +[default0]: [--noise-density NOISE_DENSITY] +[default0]: [--mean-noise-span-length MEAN_NOISE_SPAN_LENGTH] +[default2]: [--model-parallel-size MODEL_PARALLEL_SIZE] +[default0]: [--prefixlm] [--adlr-autoresume] +[default0]: [--adlr-autoresume-interval ADLR_AUTORESUME_INTERVAL] +[default2]: [--num-layers-per-virtual-pipeline-stage NUM_LAYERS_PER_VIRTUAL_PIPELINE_STAGE] +[default2]: [--distributed-backend {nccl,gloo}] +[default2]: [--DDP-impl {local,torch}] +[default2]: [--use-contiguous-buffers-in-ddp] +[default0]: [--ict-head-size ICT_HEAD_SIZE] +[default0]: [--biencoder-projection-dim BIENCODER_PROJECTION_DIM] +[default0]: [--biencoder-shared-query-context-model] +[default2]: [--no-scatter-gather-tensors-in-pipeline] +[default2]: [--local_rank LOCAL_RANK] +[default2]: [--lazy-mpu-init LAZY_MPU_INIT] +[default2]: [--use-cpu-initialization] [--eval-iters EVAL_ITERS] +[default2]: [--eval-interval EVAL_INTERVAL] +[default0]: [--ict-load ICT_LOAD] [--bert-load BERT_LOAD] +[default0]: [--titles-data-path TITLES_DATA_PATH] +[default0]: [--query-in-block-prob QUERY_IN_BLOCK_PROB] +[default2]: [--data-path [DATA_PATH [DATA_PATH ...]]] +[default2]: [--split SPLIT] +[default2]: [--train-weighted-split-paths [TRAIN_WEIGHTED_SPLIT_PATHS [TRAIN_WEIGHTED_SPLIT_PATHS ...]]] +[default2]: [--valid-weighted-split-paths [VALID_WEIGHTED_SPLIT_PATHS [VALID_WEIGHTED_SPLIT_PATHS ...]]] +[default0]: [--use-one-sent-docs] +[default0]: [--evidence-data-path EVIDENCE_DATA_PATH] +[default2]: [--test-weighted-split-paths [TEST_WEIGHTED_SPLIT_PATHS [TEST_WEIGHTED_SPLIT_PATHS ...]]] +[default4]:usage: finetune_t0.py [-h] [--num-layers NUM_LAYERS] +[default1]: [--test-weighted-split-paths [TEST_WEIGHTED_SPLIT_PATHS [TEST_WEIGHTED_SPLIT_PATHS ...]]] +[default4]: [--hidden-size HIDDEN_SIZE] +[default4]: [--ffn-hidden-size FFN_HIDDEN_SIZE] +[default4]: [--num-attention-heads NUM_ATTENTION_HEADS] +[default4]: [--kv-channels KV_CHANNELS] +[default2]: [--train-weighted-split-paths-path TRAIN_WEIGHTED_SPLIT_PATHS_PATH] +[default2]: [--valid-weighted-split-paths-path VALID_WEIGHTED_SPLIT_PATHS_PATH] +[default2]: [--test-weighted-split-paths-path TEST_WEIGHTED_SPLIT_PATHS_PATH] +[default0]: [--retriever-report-topk-accuracies RETRIEVER_REPORT_TOPK_ACCURACIES [RETRIEVER_REPORT_TOPK_ACCURACIES ...]] +[default0]: [--retriever-score-scaling] +[default2]: [--log-path LOG_PATH] [--vocab-file VOCAB_FILE] +[default2]: [--merge-file MERGE_FILE] +[default2]: [--vocab-extra-ids VOCAB_EXTRA_IDS] +[default2]: [--seq-length SEQ_LENGTH] +[default4]: [--max-position-embeddings MAX_POSITION_EMBEDDINGS] +[default0]: [--scattered-embeddings] [--split-transformers] +[default0]: [--memory-centric-tiled-linear] +[default0]: [--tile-factor TILE_FACTOR] +[default0]: [--deepspeed-activation-checkpointing] +[default2]: [--encoder-seq-length ENCODER_SEQ_LENGTH] +[default0]: [--block-data-path BLOCK_DATA_PATH] +[default0]: [--embedding-path EMBEDDING_PATH] +[default2]: [--decoder-seq-length DECODER_SEQ_LENGTH] +[default2]: [--retriever-seq-length RETRIEVER_SEQ_LENGTH] +[default2]: [--sample-rate SAMPLE_RATE] [--mask-prob MASK_PROB] +[default2]: [--short-seq-prob SHORT_SEQ_PROB] [--mmap-warmup] +[default0]: [--partition-activations] [--contigious-checkpointing] +[default2]: [--lr-warmup-fraction LR_WARMUP_FRACTION] +[default2]: [--lr-warmup-iters LR_WARMUP_ITERS] +[default2]: [--lr-warmup-samples LR_WARMUP_SAMPLES] +[default2]: [--warmup WARMUP] [--min-lr MIN_LR] +[default2]: [--override-lr-scheduler] +[default3]: [--lr-warmup-samples LR_WARMUP_SAMPLES] +[default0]: [--indexer-batch-size INDEXER_BATCH_SIZE] +[default0]: [--indexer-log-interval INDEXER_LOG_INTERVAL] +[default2]: [--num-workers NUM_WORKERS] +[default2]: [--valid-num-workers VALID_NUM_WORKERS] +[default2]: [--tokenizer-type {BertWordPieceLowerCase,BertWordPieceCase,GPT2BPETokenizer,PretrainedFromHF}] +[default2]: [--tokenizer-name-or-path TOKENIZER_NAME_OR_PATH] +[default4]: [--make-vocab-size-divisible-by MAKE_VOCAB_SIZE_DIVISIBLE_BY] +[default0]: [--checkpoint-in-cpu] [--synchronize-each-layer] +[default0]: [--num-classes NUM_CLASSES] [--img-dim IMG_DIM] +[default0]: [--num-channels NUM_CHANNELS] [--patch-dim PATCH_DIM] +[default2]: [--data-impl {lazy,cached,mmap,infer}] +[default2]: [--reset-position-ids] [--reset-attention-mask] +[default2]: [--eod-mask-loss] [--loss-on-targets-only] +[default2]: [--norm-target-loss] +[default0]: [--profile-backward] [--deepspeed] +[default0]: [--deepspeed_config DEEPSPEED_CONFIG] [--deepscale] +[default1]: [--train-weighted-split-paths-path TRAIN_WEIGHTED_SPLIT_PATHS_PATH] +[default0]: [--log-params-norm] [--log-num-zeros-in-grad] +[default0]: [--tensorboard-log-interval TENSORBOARD_LOG_INTERVAL] +[default2]: [--reweight-loss-based-on-position-frequency] +[default2]: [--noise-density NOISE_DENSITY] +[default2]: [--mean-noise-span-length MEAN_NOISE_SPAN_LENGTH] +[default2]: [--prefixlm] [--adlr-autoresume] +[default0]: [--tensorboard-queue-size TENSORBOARD_QUEUE_SIZE] +[default1]: [--valid-weighted-split-paths-path VALID_WEIGHTED_SPLIT_PATHS_PATH] +[default1]: [--test-weighted-split-paths-path TEST_WEIGHTED_SPLIT_PATHS_PATH] +[default1]: [--log-path LOG_PATH] [--vocab-file VOCAB_FILE] +[default1]: [--merge-file MERGE_FILE] +[default0]: [--log-timers-to-tensorboard] +[default2]: [--adlr-autoresume-interval ADLR_AUTORESUME_INTERVAL] +[default0]: [--log-batch-size-to-tensorboard] +[default0]: [--no-log-learnig-rate-to-tensorboard] +[default2]: [--ict-head-size ICT_HEAD_SIZE] +[default1]: [--vocab-extra-ids VOCAB_EXTRA_IDS] +[default2]: [--biencoder-projection-dim BIENCODER_PROJECTION_DIM] +[default2]: [--biencoder-shared-query-context-model] +[default2]: [--ict-load ICT_LOAD] [--bert-load BERT_LOAD] +[default0]: [--no-log-loss-scale-to-tensorboard] +[default0]: [--deepscale_config DEEPSCALE_CONFIG] [--deepspeed_mpi] +[default0]:finetune_t0.py: error: unrecognized arguments: --reset-progress +[default0]: [--log-validation-ppl-to-tensorboard] +[default2]: [--titles-data-path TITLES_DATA_PATH] +[default0]: [--zero-stage ZERO_STAGE] [--zero-reduce-scatter] +[default0]: [--zero-contigious-gradients] +[default0]: [--zero-reduce-bucket-size ZERO_REDUCE_BUCKET_SIZE] +[default0]: [--zero-allgather-bucket-size ZERO_ALLGATHER_BUCKET_SIZE] +[default2]: [--query-in-block-prob QUERY_IN_BLOCK_PROB] +[default2]: [--use-one-sent-docs] +[default2]: [--evidence-data-path EVIDENCE_DATA_PATH] +[default2]: [--retriever-report-topk-accuracies RETRIEVER_REPORT_TOPK_ACCURACIES [RETRIEVER_REPORT_TOPK_ACCURACIES ...]] +[default0]: [--remote-device {none,cpu,nvme}] [--use-pin-memory] +[default0]: [--scattered-embeddings] [--split-transformers] +[default2]: [--retriever-score-scaling] +[default0]: [--memory-centric-tiled-linear] +[default0]: [--tile-factor TILE_FACTOR] +[default0]: [--deepspeed-activation-checkpointing] +[default2]: [--block-data-path BLOCK_DATA_PATH] +[default2]: [--embedding-path EMBEDDING_PATH] +[default2]: [--indexer-batch-size INDEXER_BATCH_SIZE] +[default2]: [--indexer-log-interval INDEXER_LOG_INTERVAL] +[default0]: [--partition-activations] [--contigious-checkpointing] +[default2]: [--use-checkpoint-lr-scheduler] [--save SAVE] +[default4]: [--pad-vocab-size-to PAD_VOCAB_SIZE_TO] +[default0]: [--checkpoint-in-cpu] [--synchronize-each-layer] +[default2]: [--num-classes NUM_CLASSES] [--img-dim IMG_DIM] +[default2]: [--num-channels NUM_CHANNELS] [--patch-dim PATCH_DIM] +[default2]: [--log-params-norm] [--log-num-zeros-in-grad] +[default2]: [--tensorboard-log-interval TENSORBOARD_LOG_INTERVAL] +[default2]: [--tensorboard-queue-size TENSORBOARD_QUEUE_SIZE] +[default0]: [--profile-backward] [--deepspeed] +[default0]: [--deepspeed_config DEEPSPEED_CONFIG] [--deepscale] +[default2]: [--log-timers-to-tensorboard] +[default2]: [--log-batch-size-to-tensorboard] +[default2]: [--no-log-learnig-rate-to-tensorboard] +[default4]: [--layernorm-epsilon LAYERNORM_EPSILON] +[default2]: [--no-log-loss-scale-to-tensorboard] +[default0]: [--deepscale_config DEEPSCALE_CONFIG] [--deepspeed_mpi] +[default0]:finetune_t0.py: error: unrecognized arguments: --reset-progress +[default2]: [--log-validation-ppl-to-tensorboard] +[default2]: [--zero-stage ZERO_STAGE] [--zero-reduce-scatter] +[default2]: [--zero-contigious-gradients] +[default2]: [--zero-reduce-bucket-size ZERO_REDUCE_BUCKET_SIZE] +[default2]: [--zero-allgather-bucket-size ZERO_ALLGATHER_BUCKET_SIZE] +[default2]: [--remote-device {none,cpu,nvme}] [--use-pin-memory] +[default2]: [--scattered-embeddings] [--split-transformers] +[default2]: [--memory-centric-tiled-linear] +[default2]: [--tile-factor TILE_FACTOR] +[default2]: [--deepspeed-activation-checkpointing] +[default2]: [--partition-activations] [--contigious-checkpointing] +[default2]: [--checkpoint-in-cpu] [--synchronize-each-layer] +[default2]: [--profile-backward] [--deepspeed] +[default2]: [--deepspeed_config DEEPSPEED_CONFIG] [--deepscale] +[default2]: [--deepscale_config DEEPSCALE_CONFIG] [--deepspeed_mpi] +[default2]:finetune_t0.py: error: unrecognized arguments: --reset-progress +[default4]: [--sync-tp-duplicated-parameters] +[default3]: [--warmup WARMUP] [--min-lr MIN_LR] +[default3]: [--override-lr-scheduler] +[default3]: [--use-checkpoint-lr-scheduler] [--save SAVE] +[default2]: [--save-interval SAVE_INTERVAL] [--no-save-optim] +[default2]: [--no-save-rng] [--load LOAD] [--no-load-optim] +[default4]: [--apply-residual-connection-post-layernorm] +[default4]: [--embed-layernorm] [--openai-gelu] +[default4]: [--onnx-safe ONNX_SAFE] [--bert-no-binary-head] +[default2]: [--no-load-rng] [--finetune] [--fp16] [--bf16] +[default4]: [--position-embedding-type {PositionEmbeddingType.rotary,PositionEmbeddingType.absolute,PositionEmbeddingType.alibi}] +[default4]: [--glu-activation {geglu,liglu,reglu,swiglu}] +[default4]: [--kill-switch-path KILL_SWITCH_PATH] +[default4]: [--log-level {debug,info,warning,error,critical}] +[default1]: [--seq-length SEQ_LENGTH] +[default1]: [--encoder-seq-length ENCODER_SEQ_LENGTH] +[default1]: [--decoder-seq-length DECODER_SEQ_LENGTH] +[default4]: [--log-level-replica {debug,info,warning,error,critical}] +[default3]: [--save-interval SAVE_INTERVAL] [--no-save-optim] +[default3]: [--no-save-rng] [--load LOAD] [--no-load-optim] +[default3]: [--no-load-rng] [--finetune] [--fp16] [--bf16] +[default4]: [--attention-dropout ATTENTION_DROPOUT] +[default4]: [--hidden-dropout HIDDEN_DROPOUT] +[default4]: [--weight-decay WEIGHT_DECAY] [--clip-grad CLIP_GRAD] +[default4]:usage: finetune_t0.py [-h] [--num-layers NUM_LAYERS] +[default4]: [--hidden-size HIDDEN_SIZE] +[default4]: [--ffn-hidden-size FFN_HIDDEN_SIZE] +[default1]: [--retriever-seq-length RETRIEVER_SEQ_LENGTH] +[default1]: [--sample-rate SAMPLE_RATE] [--mask-prob MASK_PROB] +[default1]: [--short-seq-prob SHORT_SEQ_PROB] [--mmap-warmup] +[default3]: [--loss-scale LOSS_SCALE] +[default2]: [--loss-scale LOSS_SCALE] +[default4]: [--num-attention-heads NUM_ATTENTION_HEADS] +[default4]: [--kv-channels KV_CHANNELS] +[default2]: [--initial-loss-scale INITIAL_LOSS_SCALE] +[default2]: [--min-loss-scale MIN_LOSS_SCALE] +[default4]: [--max-position-embeddings MAX_POSITION_EMBEDDINGS] +[default4]: [--make-vocab-size-divisible-by MAKE_VOCAB_SIZE_DIVISIBLE_BY] +[default4]: [--pad-vocab-size-to PAD_VOCAB_SIZE_TO] +[default4]: [--layernorm-epsilon LAYERNORM_EPSILON] +[default4]: [--sync-tp-duplicated-parameters] +[default4]: [--apply-residual-connection-post-layernorm] +[default4]: [--embed-layernorm] [--openai-gelu] +[default4]: [--onnx-safe ONNX_SAFE] [--bert-no-binary-head] +[default4]: [--position-embedding-type {PositionEmbeddingType.rotary,PositionEmbeddingType.absolute,PositionEmbeddingType.alibi}] +[default4]: [--glu-activation {geglu,liglu,reglu,swiglu}] +[default2]: [--loss-scale-window LOSS_SCALE_WINDOW] +[default4]: [--kill-switch-path KILL_SWITCH_PATH] +[default4]: [--log-level {debug,info,warning,error,critical}] +[default4]: [--log-level-replica {debug,info,warning,error,critical}] +[default4]: [--attention-dropout ATTENTION_DROPOUT] +[default4]: [--hidden-dropout HIDDEN_DROPOUT] +[default4]: [--weight-decay WEIGHT_DECAY] [--clip-grad CLIP_GRAD] +[default4]: [--adam-beta1 ADAM_BETA1] [--adam-beta2 ADAM_BETA2] +[default4]: [--adam-eps ADAM_EPS] [--sgd-momentum SGD_MOMENTUM] +[default4]: [--micro-batch-size MICRO_BATCH_SIZE] +[default4]: [--batch-size BATCH_SIZE] +[default4]: [--global-batch-size GLOBAL_BATCH_SIZE] +[default4]: [--rampup-batch-size [RAMPUP_BATCH_SIZE [RAMPUP_BATCH_SIZE ...]]] +[default4]: [--checkpoint-activations] +[default4]: [--distribute-checkpointed-activations] +[default4]: [--checkpoint-num-layers CHECKPOINT_NUM_LAYERS] +[default4]: [--train-iters TRAIN_ITERS] +[default4]: [--train-samples TRAIN_SAMPLES] +[default4]: [--train-tokens TRAIN_TOKENS] +[default4]: [--log-interval LOG_INTERVAL] +[default4]: [--exit-interval EXIT_INTERVAL] +[default4]: [--exit-duration-in-mins EXIT_DURATION_IN_MINS] +[default4]: [--tensorboard-dir TENSORBOARD_DIR] +[default4]: [--no-masked-softmax-fusion] [--no-bias-gelu-fusion] +[default4]: [--no-bias-dropout-fusion] [--optimizer {adam,sgd}] +[default4]: [--use-bnb-optimizer] +[default4]: [--dataloader-type {single,cyclic}] [--cpu-optimizer] +[default4]: [--cpu_torch_adam] [--codecarbon-dir CODECARBON_DIR] +[default4]: [--eval-only EVAL_ONLY] +[default3]: [--initial-loss-scale INITIAL_LOSS_SCALE] +[default4]: [--adam-beta1 ADAM_BETA1] [--adam-beta2 ADAM_BETA2] +[default4]: [--adam-eps ADAM_EPS] [--sgd-momentum SGD_MOMENTUM] +[default1]: [--num-workers NUM_WORKERS] +[default4]: [--skip-train-iteration-range SKIP_TRAIN_ITERATION_RANGE [SKIP_TRAIN_ITERATION_RANGE ...]] +[default4]: [--inference] +[default4]: [--abort-on-unmet-fused-kernel-constraints] +[default4]: [--pp-partition-method PP_PARTITION_METHOD] +[default4]: [--seed SEED] [--init-method-std INIT_METHOD_STD] +[default4]: [--init-method-xavier-uniform] [--lr LR] +[default4]: [--lr-decay-style {constant,linear,cosine}] +[default4]: [--lr-decay-iters LR_DECAY_ITERS] +[default4]: [--lr-decay-samples LR_DECAY_SAMPLES] +[default4]: [--lr-decay-tokens LR_DECAY_TOKENS] +[default4]: [--lr-warmup-fraction LR_WARMUP_FRACTION] +[default4]: [--lr-warmup-iters LR_WARMUP_ITERS] +[default4]: [--lr-warmup-samples LR_WARMUP_SAMPLES] +[default4]: [--warmup WARMUP] [--min-lr MIN_LR] +[default4]: [--override-lr-scheduler] +[default4]: [--use-checkpoint-lr-scheduler] [--save SAVE] +[default4]: [--save-interval SAVE_INTERVAL] [--no-save-optim] +[default4]: [--no-save-rng] [--load LOAD] [--no-load-optim] +[default4]: [--no-load-rng] [--finetune] [--fp16] [--bf16] +[default4]: [--loss-scale LOSS_SCALE] +[default4]: [--initial-loss-scale INITIAL_LOSS_SCALE] +[default2]: [--hysteresis HYSTERESIS] [--fp32-residual-connection] +[default2]: [--no-query-key-layer-scaling] +[default2]: [--attention-softmax-in-fp32] +[default1]: [--valid-num-workers VALID_NUM_WORKERS] +[default4]: [--min-loss-scale MIN_LOSS_SCALE] +[default4]: [--loss-scale-window LOSS_SCALE_WINDOW] +[default4]: [--hysteresis HYSTERESIS] [--fp32-residual-connection] +[default4]: [--no-query-key-layer-scaling] +[default4]: [--attention-softmax-in-fp32] +[default4]: [--accumulate-allreduce-grads-in-fp32] +[default4]: [--fp16-lm-cross-entropy] +[default4]: [--tensor-model-parallel-size TENSOR_MODEL_PARALLEL_SIZE] +[default4]: [--pipeline-model-parallel-size PIPELINE_MODEL_PARALLEL_SIZE] +[default4]: [--model-parallel-size MODEL_PARALLEL_SIZE] +[default4]: [--num-layers-per-virtual-pipeline-stage NUM_LAYERS_PER_VIRTUAL_PIPELINE_STAGE] +[default4]: [--distributed-backend {nccl,gloo}] +[default4]: [--DDP-impl {local,torch}] +[default4]: [--use-contiguous-buffers-in-ddp] +[default4]: [--no-scatter-gather-tensors-in-pipeline] +[default4]: [--local_rank LOCAL_RANK] +[default4]: [--lazy-mpu-init LAZY_MPU_INIT] +[default3]: [--min-loss-scale MIN_LOSS_SCALE] +[default2]: [--accumulate-allreduce-grads-in-fp32] +[default2]: [--fp16-lm-cross-entropy] +[default1]: [--tokenizer-type {BertWordPieceLowerCase,BertWordPieceCase,GPT2BPETokenizer,PretrainedFromHF}] +[default3]: [--loss-scale-window LOSS_SCALE_WINDOW] +[default3]: [--hysteresis HYSTERESIS] [--fp32-residual-connection] +[default4]: [--use-cpu-initialization] [--eval-iters EVAL_ITERS] +[default4]: [--eval-interval EVAL_INTERVAL] +[default4]: [--data-path [DATA_PATH [DATA_PATH ...]]] +[default4]: [--split SPLIT] +[default4]: [--train-weighted-split-paths [TRAIN_WEIGHTED_SPLIT_PATHS [TRAIN_WEIGHTED_SPLIT_PATHS ...]]] +[default4]: [--valid-weighted-split-paths [VALID_WEIGHTED_SPLIT_PATHS [VALID_WEIGHTED_SPLIT_PATHS ...]]] +[default4]: [--test-weighted-split-paths [TEST_WEIGHTED_SPLIT_PATHS [TEST_WEIGHTED_SPLIT_PATHS ...]]] +[default4]: [--train-weighted-split-paths-path TRAIN_WEIGHTED_SPLIT_PATHS_PATH] +[default4]: [--valid-weighted-split-paths-path VALID_WEIGHTED_SPLIT_PATHS_PATH] +[default4]: [--test-weighted-split-paths-path TEST_WEIGHTED_SPLIT_PATHS_PATH] +[default4]: [--log-path LOG_PATH] [--vocab-file VOCAB_FILE] +[default4]: [--merge-file MERGE_FILE] +[default4]: [--vocab-extra-ids VOCAB_EXTRA_IDS] +[default4]: [--seq-length SEQ_LENGTH] +[default4]: [--encoder-seq-length ENCODER_SEQ_LENGTH] +[default4]: [--decoder-seq-length DECODER_SEQ_LENGTH] +[default4]: [--retriever-seq-length RETRIEVER_SEQ_LENGTH] +[default4]: [--sample-rate SAMPLE_RATE] [--mask-prob MASK_PROB] +[default4]: [--short-seq-prob SHORT_SEQ_PROB] [--mmap-warmup] +[default4]: [--num-workers NUM_WORKERS] +[default4]: [--valid-num-workers VALID_NUM_WORKERS] +[default4]: [--tokenizer-type {BertWordPieceLowerCase,BertWordPieceCase,GPT2BPETokenizer,PretrainedFromHF}] +[default4]: [--tokenizer-name-or-path TOKENIZER_NAME_OR_PATH] +[default4]: [--data-impl {lazy,cached,mmap,infer}] +[default4]: [--reset-position-ids] [--reset-attention-mask] +[default4]: [--eod-mask-loss] [--loss-on-targets-only] +[default4]: [--norm-target-loss] +[default4]: [--reweight-loss-based-on-position-frequency] +[default4]: [--noise-density NOISE_DENSITY] +[default4]: [--mean-noise-span-length MEAN_NOISE_SPAN_LENGTH] +[default4]: [--prefixlm] [--adlr-autoresume] +[default4]: [--adlr-autoresume-interval ADLR_AUTORESUME_INTERVAL] +[default4]: [--ict-head-size ICT_HEAD_SIZE] +[default4]: [--biencoder-projection-dim BIENCODER_PROJECTION_DIM] +[default4]: [--biencoder-shared-query-context-model] +[default4]: [--ict-load ICT_LOAD] [--bert-load BERT_LOAD] +[default4]: [--titles-data-path TITLES_DATA_PATH] +[default4]: [--query-in-block-prob QUERY_IN_BLOCK_PROB] +[default4]: [--use-one-sent-docs] +[default4]: [--evidence-data-path EVIDENCE_DATA_PATH] +[default3]: [--no-query-key-layer-scaling] +[default3]: [--attention-softmax-in-fp32] +[default4]: [--retriever-report-topk-accuracies RETRIEVER_REPORT_TOPK_ACCURACIES [RETRIEVER_REPORT_TOPK_ACCURACIES ...]] +[default4]: [--retriever-score-scaling] +[default3]: [--accumulate-allreduce-grads-in-fp32] +[default4]: [--block-data-path BLOCK_DATA_PATH] +[default2]: [--tensor-model-parallel-size TENSOR_MODEL_PARALLEL_SIZE] +[default2]: [--pipeline-model-parallel-size PIPELINE_MODEL_PARALLEL_SIZE] +[default3]: [--fp16-lm-cross-entropy] +[default4]: [--micro-batch-size MICRO_BATCH_SIZE] +[default4]: [--batch-size BATCH_SIZE] +[default1]: [--tokenizer-name-or-path TOKENIZER_NAME_OR_PATH] +[default1]: [--data-impl {lazy,cached,mmap,infer}] +[default1]: [--reset-position-ids] [--reset-attention-mask] +[default3]: [--tensor-model-parallel-size TENSOR_MODEL_PARALLEL_SIZE] +[default3]: [--pipeline-model-parallel-size PIPELINE_MODEL_PARALLEL_SIZE] +[default3]: [--model-parallel-size MODEL_PARALLEL_SIZE] +[default3]: [--num-layers-per-virtual-pipeline-stage NUM_LAYERS_PER_VIRTUAL_PIPELINE_STAGE] +[default3]: [--distributed-backend {nccl,gloo}] +[default2]: [--model-parallel-size MODEL_PARALLEL_SIZE] +[default4]: [--global-batch-size GLOBAL_BATCH_SIZE] +[default4]: [--rampup-batch-size [RAMPUP_BATCH_SIZE [RAMPUP_BATCH_SIZE ...]]] +[default4]: [--checkpoint-activations] +[default4]: [--distribute-checkpointed-activations] +[default4]: [--checkpoint-num-layers CHECKPOINT_NUM_LAYERS] +[default4]: [--train-iters TRAIN_ITERS] +[default4]: [--train-samples TRAIN_SAMPLES] +[default4]: [--train-tokens TRAIN_TOKENS] +[default4]: [--log-interval LOG_INTERVAL] +[default4]: [--exit-interval EXIT_INTERVAL] +[default4]: [--embedding-path EMBEDDING_PATH] +[default4]: [--indexer-batch-size INDEXER_BATCH_SIZE] +[default4]: [--indexer-log-interval INDEXER_LOG_INTERVAL] +[default4]: [--num-classes NUM_CLASSES] [--img-dim IMG_DIM] +[default4]: [--num-channels NUM_CHANNELS] [--patch-dim PATCH_DIM] +[default4]: [--log-params-norm] [--log-num-zeros-in-grad] +[default4]: [--tensorboard-log-interval TENSORBOARD_LOG_INTERVAL] +[default4]: [--tensorboard-queue-size TENSORBOARD_QUEUE_SIZE] +[default4]: [--log-timers-to-tensorboard] +[default4]: [--log-batch-size-to-tensorboard] +[default4]: [--no-log-learnig-rate-to-tensorboard] +[default4]: [--no-log-loss-scale-to-tensorboard] +[default4]: [--log-validation-ppl-to-tensorboard] +[default4]: [--zero-stage ZERO_STAGE] [--zero-reduce-scatter] +[default4]: [--zero-contigious-gradients] +[default4]: [--zero-reduce-bucket-size ZERO_REDUCE_BUCKET_SIZE] +[default4]: [--zero-allgather-bucket-size ZERO_ALLGATHER_BUCKET_SIZE] +[default4]: [--remote-device {none,cpu,nvme}] [--use-pin-memory] +[default4]: [--scattered-embeddings] [--split-transformers] +[default4]: [--memory-centric-tiled-linear] +[default4]: [--tile-factor TILE_FACTOR] +[default4]: [--deepspeed-activation-checkpointing] +[default4]: [--partition-activations] [--contigious-checkpointing] +[default4]: [--checkpoint-in-cpu] [--synchronize-each-layer] +[default4]: [--profile-backward] [--deepspeed] +[default4]: [--deepspeed_config DEEPSPEED_CONFIG] [--deepscale] +[default4]: [--deepscale_config DEEPSCALE_CONFIG] [--deepspeed_mpi] +[default4]:finetune_t0.py: error: unrecognized arguments: --reset-progress +[default3]: [--DDP-impl {local,torch}] +[default3]:usage: finetune_t0.py [-h] [--num-layers NUM_LAYERS] +[default3]: [--hidden-size HIDDEN_SIZE] +[default3]: [--ffn-hidden-size FFN_HIDDEN_SIZE] +[default3]: [--num-attention-heads NUM_ATTENTION_HEADS] +[default3]: [--kv-channels KV_CHANNELS] +[default3]: [--use-contiguous-buffers-in-ddp] +[default3]: [--no-scatter-gather-tensors-in-pipeline] +[default3]: [--max-position-embeddings MAX_POSITION_EMBEDDINGS] +[default3]: [--make-vocab-size-divisible-by MAKE_VOCAB_SIZE_DIVISIBLE_BY] +[default3]: [--pad-vocab-size-to PAD_VOCAB_SIZE_TO] +[default3]: [--layernorm-epsilon LAYERNORM_EPSILON] +[default3]: [--sync-tp-duplicated-parameters] +[default3]: [--apply-residual-connection-post-layernorm] +[default3]: [--embed-layernorm] [--openai-gelu] +[default3]: [--onnx-safe ONNX_SAFE] [--bert-no-binary-head] +[default3]: [--local_rank LOCAL_RANK] +[default3]: [--position-embedding-type {PositionEmbeddingType.rotary,PositionEmbeddingType.absolute,PositionEmbeddingType.alibi}] +[default3]: [--glu-activation {geglu,liglu,reglu,swiglu}] +[default3]: [--kill-switch-path KILL_SWITCH_PATH] +[default3]: [--log-level {debug,info,warning,error,critical}] +[default3]: [--log-level-replica {debug,info,warning,error,critical}] +[default3]: [--attention-dropout ATTENTION_DROPOUT] +[default3]: [--hidden-dropout HIDDEN_DROPOUT] +[default4]: [--exit-duration-in-mins EXIT_DURATION_IN_MINS] +[default3]: [--weight-decay WEIGHT_DECAY] [--clip-grad CLIP_GRAD] +[default3]: [--adam-beta1 ADAM_BETA1] [--adam-beta2 ADAM_BETA2] +[default3]: [--adam-eps ADAM_EPS] [--sgd-momentum SGD_MOMENTUM] +[default3]: [--micro-batch-size MICRO_BATCH_SIZE] +[default3]: [--batch-size BATCH_SIZE] +[default3]: [--global-batch-size GLOBAL_BATCH_SIZE] +[default3]: [--rampup-batch-size [RAMPUP_BATCH_SIZE [RAMPUP_BATCH_SIZE ...]]] +[default4]: [--tensorboard-dir TENSORBOARD_DIR] +[default3]: [--checkpoint-activations] +[default3]: [--distribute-checkpointed-activations] +[default3]: [--checkpoint-num-layers CHECKPOINT_NUM_LAYERS] +[default3]: [--train-iters TRAIN_ITERS] +[default3]: [--train-samples TRAIN_SAMPLES] +[default3]: [--train-tokens TRAIN_TOKENS] +[default3]: [--log-interval LOG_INTERVAL] +[default3]: [--exit-interval EXIT_INTERVAL] +[default3]: [--exit-duration-in-mins EXIT_DURATION_IN_MINS] +[default3]: [--tensorboard-dir TENSORBOARD_DIR] +[default3]: [--no-masked-softmax-fusion] [--no-bias-gelu-fusion] +[default3]: [--no-bias-dropout-fusion] [--optimizer {adam,sgd}] +[default3]: [--use-bnb-optimizer] +[default3]: [--dataloader-type {single,cyclic}] [--cpu-optimizer] +[default3]: [--cpu_torch_adam] [--codecarbon-dir CODECARBON_DIR] +[default3]: [--eval-only EVAL_ONLY] +[default3]: [--skip-train-iteration-range SKIP_TRAIN_ITERATION_RANGE [SKIP_TRAIN_ITERATION_RANGE ...]] +[default3]: [--inference] +[default3]: [--abort-on-unmet-fused-kernel-constraints] +[default3]: [--pp-partition-method PP_PARTITION_METHOD] +[default3]: [--seed SEED] [--init-method-std INIT_METHOD_STD] +[default3]: [--init-method-xavier-uniform] [--lr LR] +[default3]: [--lr-decay-style {constant,linear,cosine}] +[default3]: [--lr-decay-iters LR_DECAY_ITERS] +[default3]: [--lr-decay-samples LR_DECAY_SAMPLES] +[default3]: [--lr-decay-tokens LR_DECAY_TOKENS] +[default3]: [--lr-warmup-fraction LR_WARMUP_FRACTION] +[default3]: [--lr-warmup-iters LR_WARMUP_ITERS] +[default3]: [--lr-warmup-samples LR_WARMUP_SAMPLES] +[default3]: [--warmup WARMUP] [--min-lr MIN_LR] +[default3]: [--override-lr-scheduler] +[default3]: [--use-checkpoint-lr-scheduler] [--save SAVE] +[default3]: [--save-interval SAVE_INTERVAL] [--no-save-optim] +[default3]: [--no-save-rng] [--load LOAD] [--no-load-optim] +[default3]: [--no-load-rng] [--finetune] [--fp16] [--bf16] +[default3]: [--loss-scale LOSS_SCALE] +[default3]: [--initial-loss-scale INITIAL_LOSS_SCALE] +[default4]: [--no-masked-softmax-fusion] [--no-bias-gelu-fusion] +[default3]: [--min-loss-scale MIN_LOSS_SCALE] +[default3]: [--loss-scale-window LOSS_SCALE_WINDOW] +[default3]: [--hysteresis HYSTERESIS] [--fp32-residual-connection] +[default3]: [--no-query-key-layer-scaling] +[default3]: [--attention-softmax-in-fp32] +[default4]: [--no-bias-dropout-fusion] [--optimizer {adam,sgd}] +[default3]: [--accumulate-allreduce-grads-in-fp32] +[default3]: [--fp16-lm-cross-entropy] +[default3]: [--tensor-model-parallel-size TENSOR_MODEL_PARALLEL_SIZE] +[default3]: [--pipeline-model-parallel-size PIPELINE_MODEL_PARALLEL_SIZE] +[default3]: [--model-parallel-size MODEL_PARALLEL_SIZE] +[default4]: [--use-bnb-optimizer] +[default4]: [--dataloader-type {single,cyclic}] [--cpu-optimizer] +[default3]: [--num-layers-per-virtual-pipeline-stage NUM_LAYERS_PER_VIRTUAL_PIPELINE_STAGE] +[default3]: [--distributed-backend {nccl,gloo}] +[default3]: [--DDP-impl {local,torch}] +[default3]: [--use-contiguous-buffers-in-ddp] +[default3]: [--no-scatter-gather-tensors-in-pipeline] +[default3]: [--local_rank LOCAL_RANK] +[default3]: [--lazy-mpu-init LAZY_MPU_INIT] +[default3]: [--use-cpu-initialization] [--eval-iters EVAL_ITERS] +[default4]: [--cpu_torch_adam] [--codecarbon-dir CODECARBON_DIR] +[default3]: [--eval-interval EVAL_INTERVAL] +[default3]: [--data-path [DATA_PATH [DATA_PATH ...]]] +[default3]: [--split SPLIT] +[default3]: [--train-weighted-split-paths [TRAIN_WEIGHTED_SPLIT_PATHS [TRAIN_WEIGHTED_SPLIT_PATHS ...]]] +[default3]: [--valid-weighted-split-paths [VALID_WEIGHTED_SPLIT_PATHS [VALID_WEIGHTED_SPLIT_PATHS ...]]] +[default3]: [--test-weighted-split-paths [TEST_WEIGHTED_SPLIT_PATHS [TEST_WEIGHTED_SPLIT_PATHS ...]]] +[default1]: [--eod-mask-loss] [--loss-on-targets-only] +[default3]: [--train-weighted-split-paths-path TRAIN_WEIGHTED_SPLIT_PATHS_PATH] +[default3]: [--valid-weighted-split-paths-path VALID_WEIGHTED_SPLIT_PATHS_PATH] +[default3]: [--test-weighted-split-paths-path TEST_WEIGHTED_SPLIT_PATHS_PATH] +[default3]: [--log-path LOG_PATH] [--vocab-file VOCAB_FILE] +[default3]: [--merge-file MERGE_FILE] +[default3]: [--vocab-extra-ids VOCAB_EXTRA_IDS] +[default3]: [--seq-length SEQ_LENGTH] +[default3]: [--encoder-seq-length ENCODER_SEQ_LENGTH] +[default3]: [--decoder-seq-length DECODER_SEQ_LENGTH] +[default3]: [--retriever-seq-length RETRIEVER_SEQ_LENGTH] +[default3]: [--sample-rate SAMPLE_RATE] [--mask-prob MASK_PROB] +[default3]: [--short-seq-prob SHORT_SEQ_PROB] [--mmap-warmup] +[default3]: [--num-workers NUM_WORKERS] +[default3]: [--valid-num-workers VALID_NUM_WORKERS] +[default3]: [--tokenizer-type {BertWordPieceLowerCase,BertWordPieceCase,GPT2BPETokenizer,PretrainedFromHF}] +[default3]: [--tokenizer-name-or-path TOKENIZER_NAME_OR_PATH] +[default3]: [--data-impl {lazy,cached,mmap,infer}] +[default3]: [--reset-position-ids] [--reset-attention-mask] +[default3]: [--eod-mask-loss] [--loss-on-targets-only] +[default3]: [--norm-target-loss] +[default3]: [--reweight-loss-based-on-position-frequency] +[default3]: [--noise-density NOISE_DENSITY] +[default3]: [--mean-noise-span-length MEAN_NOISE_SPAN_LENGTH] +[default3]: [--prefixlm] [--adlr-autoresume] +[default3]: [--adlr-autoresume-interval ADLR_AUTORESUME_INTERVAL] +[default3]: [--ict-head-size ICT_HEAD_SIZE] +[default3]: [--biencoder-projection-dim BIENCODER_PROJECTION_DIM] +[default3]: [--biencoder-shared-query-context-model] +[default3]: [--ict-load ICT_LOAD] [--bert-load BERT_LOAD] +[default3]: [--titles-data-path TITLES_DATA_PATH] +[default3]: [--query-in-block-prob QUERY_IN_BLOCK_PROB] +[default3]: [--use-one-sent-docs] +[default3]: [--evidence-data-path EVIDENCE_DATA_PATH] +[default3]: [--retriever-report-topk-accuracies RETRIEVER_REPORT_TOPK_ACCURACIES [RETRIEVER_REPORT_TOPK_ACCURACIES ...]] +[default3]: [--retriever-score-scaling] +[default3]: [--block-data-path BLOCK_DATA_PATH] +[default3]: [--embedding-path EMBEDDING_PATH] +[default3]: [--indexer-batch-size INDEXER_BATCH_SIZE] +[default3]: [--indexer-log-interval INDEXER_LOG_INTERVAL] +[default3]: [--num-classes NUM_CLASSES] [--img-dim IMG_DIM] +[default3]: [--num-channels NUM_CHANNELS] [--patch-dim PATCH_DIM] +[default3]: [--log-params-norm] [--log-num-zeros-in-grad] +[default3]: [--tensorboard-log-interval TENSORBOARD_LOG_INTERVAL] +[default1]: [--norm-target-loss] +[default4]: [--eval-only EVAL_ONLY] +[default4]: [--skip-train-iteration-range SKIP_TRAIN_ITERATION_RANGE [SKIP_TRAIN_ITERATION_RANGE ...]] +[default4]: [--inference] +[default4]: [--abort-on-unmet-fused-kernel-constraints] +[default3]: [--tensorboard-queue-size TENSORBOARD_QUEUE_SIZE] +[default3]: [--log-timers-to-tensorboard] +[default3]: [--log-batch-size-to-tensorboard] +[default3]: [--no-log-learnig-rate-to-tensorboard] +[default3]: [--no-log-loss-scale-to-tensorboard] +[default3]: [--log-validation-ppl-to-tensorboard] +[default3]: [--zero-stage ZERO_STAGE] [--zero-reduce-scatter] +[default3]: [--zero-contigious-gradients] +[default3]: [--zero-reduce-bucket-size ZERO_REDUCE_BUCKET_SIZE] +[default3]: [--zero-allgather-bucket-size ZERO_ALLGATHER_BUCKET_SIZE] +[default2]: [--num-layers-per-virtual-pipeline-stage NUM_LAYERS_PER_VIRTUAL_PIPELINE_STAGE] +[default3]: [--remote-device {none,cpu,nvme}] [--use-pin-memory] +[default3]: [--scattered-embeddings] [--split-transformers] +[default3]: [--memory-centric-tiled-linear] +[default3]: [--tile-factor TILE_FACTOR] +[default3]: [--deepspeed-activation-checkpointing] +[default3]: [--partition-activations] [--contigious-checkpointing] +[default3]: [--checkpoint-in-cpu] [--synchronize-each-layer] +[default3]: [--profile-backward] [--deepspeed] +[default3]: [--deepspeed_config DEEPSPEED_CONFIG] [--deepscale] +[default3]: [--deepscale_config DEEPSCALE_CONFIG] [--deepspeed_mpi] +[default3]:finetune_t0.py: error: unrecognized arguments: --reset-progress +[default2]: [--distributed-backend {nccl,gloo}] +[default2]: [--DDP-impl {local,torch}] +[default3]: [--lazy-mpu-init LAZY_MPU_INIT] +[default3]: [--use-cpu-initialization] [--eval-iters EVAL_ITERS] +[default3]: [--eval-interval EVAL_INTERVAL] +[default2]: [--use-contiguous-buffers-in-ddp] +[default2]: [--no-scatter-gather-tensors-in-pipeline] +[default4]: [--pp-partition-method PP_PARTITION_METHOD] +[default4]: [--seed SEED] [--init-method-std INIT_METHOD_STD] +[default4]: [--init-method-xavier-uniform] [--lr LR] +[default1]: [--reweight-loss-based-on-position-frequency] +[default1]: [--noise-density NOISE_DENSITY] +[default4]: [--lr-decay-style {constant,linear,cosine}] +[default3]: [--data-path [DATA_PATH [DATA_PATH ...]]] +[default3]: [--split SPLIT] +[default2]: [--local_rank LOCAL_RANK] +[default2]: [--lazy-mpu-init LAZY_MPU_INIT] +[default2]: [--use-cpu-initialization] [--eval-iters EVAL_ITERS] +[default1]: [--mean-noise-span-length MEAN_NOISE_SPAN_LENGTH] +[default4]: [--lr-decay-iters LR_DECAY_ITERS] +[default4]: [--lr-decay-samples LR_DECAY_SAMPLES] +[default4]: [--lr-decay-tokens LR_DECAY_TOKENS] +[default4]: [--lr-warmup-fraction LR_WARMUP_FRACTION] +[default4]: [--lr-warmup-iters LR_WARMUP_ITERS] +[default4]: [--lr-warmup-samples LR_WARMUP_SAMPLES] +[default3]: [--train-weighted-split-paths [TRAIN_WEIGHTED_SPLIT_PATHS [TRAIN_WEIGHTED_SPLIT_PATHS ...]]] +[default3]: [--valid-weighted-split-paths [VALID_WEIGHTED_SPLIT_PATHS [VALID_WEIGHTED_SPLIT_PATHS ...]]] +[default3]: [--test-weighted-split-paths [TEST_WEIGHTED_SPLIT_PATHS [TEST_WEIGHTED_SPLIT_PATHS ...]]] +[default3]: [--train-weighted-split-paths-path TRAIN_WEIGHTED_SPLIT_PATHS_PATH] +[default3]: [--valid-weighted-split-paths-path VALID_WEIGHTED_SPLIT_PATHS_PATH] +[default4]: [--warmup WARMUP] [--min-lr MIN_LR] +[default4]: [--override-lr-scheduler] +[default4]: [--use-checkpoint-lr-scheduler] [--save SAVE] +[default4]: [--save-interval SAVE_INTERVAL] [--no-save-optim] +[default1]: [--prefixlm] [--adlr-autoresume] +[default1]: [--adlr-autoresume-interval ADLR_AUTORESUME_INTERVAL] +[default1]: [--ict-head-size ICT_HEAD_SIZE] +[default1]: [--biencoder-projection-dim BIENCODER_PROJECTION_DIM] +[default1]: [--biencoder-shared-query-context-model] +[default3]: [--test-weighted-split-paths-path TEST_WEIGHTED_SPLIT_PATHS_PATH] +[default3]: [--log-path LOG_PATH] [--vocab-file VOCAB_FILE] +[default3]: [--merge-file MERGE_FILE] +[default3]: [--vocab-extra-ids VOCAB_EXTRA_IDS] +[default4]: [--no-save-rng] [--load LOAD] [--no-load-optim] +[default2]: [--eval-interval EVAL_INTERVAL] +[default4]: [--no-load-rng] [--finetune] [--fp16] [--bf16] +[default4]: [--loss-scale LOSS_SCALE] +[default4]: [--initial-loss-scale INITIAL_LOSS_SCALE] +[default1]: [--ict-load ICT_LOAD] [--bert-load BERT_LOAD] +[default1]: [--titles-data-path TITLES_DATA_PATH] +[default1]: [--query-in-block-prob QUERY_IN_BLOCK_PROB] +[default4]: [--min-loss-scale MIN_LOSS_SCALE] +[default3]: [--seq-length SEQ_LENGTH] +[default3]: [--encoder-seq-length ENCODER_SEQ_LENGTH] +[default4]: [--loss-scale-window LOSS_SCALE_WINDOW] +[default4]: [--hysteresis HYSTERESIS] [--fp32-residual-connection] +[default3]: [--decoder-seq-length DECODER_SEQ_LENGTH] +[default3]: [--retriever-seq-length RETRIEVER_SEQ_LENGTH] +[default4]: [--no-query-key-layer-scaling] +[default2]: [--data-path [DATA_PATH [DATA_PATH ...]]] +[default2]: [--split SPLIT] +[default4]: [--attention-softmax-in-fp32] +[default3]: [--sample-rate SAMPLE_RATE] [--mask-prob MASK_PROB] +[default3]: [--short-seq-prob SHORT_SEQ_PROB] [--mmap-warmup] +[default2]: [--train-weighted-split-paths [TRAIN_WEIGHTED_SPLIT_PATHS [TRAIN_WEIGHTED_SPLIT_PATHS ...]]] +[default2]: [--valid-weighted-split-paths [VALID_WEIGHTED_SPLIT_PATHS [VALID_WEIGHTED_SPLIT_PATHS ...]]] +[default2]: [--test-weighted-split-paths [TEST_WEIGHTED_SPLIT_PATHS [TEST_WEIGHTED_SPLIT_PATHS ...]]] +[default4]: [--accumulate-allreduce-grads-in-fp32] +[default4]: [--fp16-lm-cross-entropy] +[default3]: [--num-workers NUM_WORKERS] +[default3]: [--valid-num-workers VALID_NUM_WORKERS] +[default3]: [--tokenizer-type {BertWordPieceLowerCase,BertWordPieceCase,GPT2BPETokenizer,PretrainedFromHF}] +[default3]: [--tokenizer-name-or-path TOKENIZER_NAME_OR_PATH] +[default3]: [--data-impl {lazy,cached,mmap,infer}] +[default1]: [--use-one-sent-docs] +[default1]: [--evidence-data-path EVIDENCE_DATA_PATH] +[default1]: [--retriever-report-topk-accuracies RETRIEVER_REPORT_TOPK_ACCURACIES [RETRIEVER_REPORT_TOPK_ACCURACIES ...]] +[default1]: [--retriever-score-scaling] +[default1]: [--block-data-path BLOCK_DATA_PATH] +[default1]: [--embedding-path EMBEDDING_PATH] +[default1]: [--indexer-batch-size INDEXER_BATCH_SIZE] +[default1]: [--indexer-log-interval INDEXER_LOG_INTERVAL] +[default2]: [--train-weighted-split-paths-path TRAIN_WEIGHTED_SPLIT_PATHS_PATH] +[default4]: [--tensor-model-parallel-size TENSOR_MODEL_PARALLEL_SIZE] +[default2]: [--valid-weighted-split-paths-path VALID_WEIGHTED_SPLIT_PATHS_PATH] +[default1]: [--num-classes NUM_CLASSES] [--img-dim IMG_DIM] +[default1]: [--num-channels NUM_CHANNELS] [--patch-dim PATCH_DIM] +[default2]: [--test-weighted-split-paths-path TEST_WEIGHTED_SPLIT_PATHS_PATH] +[default3]: [--reset-position-ids] [--reset-attention-mask] +[default4]: [--pipeline-model-parallel-size PIPELINE_MODEL_PARALLEL_SIZE] +[default4]: [--model-parallel-size MODEL_PARALLEL_SIZE] +[default4]: [--num-layers-per-virtual-pipeline-stage NUM_LAYERS_PER_VIRTUAL_PIPELINE_STAGE] +[default4]: [--distributed-backend {nccl,gloo}] +[default2]: [--log-path LOG_PATH] [--vocab-file VOCAB_FILE] +[default2]: [--merge-file MERGE_FILE] +[default2]: [--vocab-extra-ids VOCAB_EXTRA_IDS] +[default4]: [--DDP-impl {local,torch}] +[default4]: [--use-contiguous-buffers-in-ddp] +[default4]: [--no-scatter-gather-tensors-in-pipeline] +[default3]: [--eod-mask-loss] [--loss-on-targets-only] +[default3]: [--norm-target-loss] +[default3]: [--reweight-loss-based-on-position-frequency] +[default2]: [--seq-length SEQ_LENGTH] +[default4]: [--local_rank LOCAL_RANK] +[default1]: [--log-params-norm] [--log-num-zeros-in-grad] +[default1]: [--tensorboard-log-interval TENSORBOARD_LOG_INTERVAL] +[default1]: [--tensorboard-queue-size TENSORBOARD_QUEUE_SIZE] +[default1]: [--log-timers-to-tensorboard] +[default2]: [--encoder-seq-length ENCODER_SEQ_LENGTH] +[default3]: [--noise-density NOISE_DENSITY] +[default3]: [--mean-noise-span-length MEAN_NOISE_SPAN_LENGTH] +[default3]: [--prefixlm] [--adlr-autoresume] +[default3]: [--adlr-autoresume-interval ADLR_AUTORESUME_INTERVAL] +[default4]: [--lazy-mpu-init LAZY_MPU_INIT] +[default4]: [--use-cpu-initialization] [--eval-iters EVAL_ITERS] +[default4]: [--eval-interval EVAL_INTERVAL] +[default4]: [--data-path [DATA_PATH [DATA_PATH ...]]] +[default4]: [--split SPLIT] +[default4]: [--train-weighted-split-paths [TRAIN_WEIGHTED_SPLIT_PATHS [TRAIN_WEIGHTED_SPLIT_PATHS ...]]] +[default1]: [--log-batch-size-to-tensorboard] +[default2]: [--decoder-seq-length DECODER_SEQ_LENGTH] +[default2]: [--retriever-seq-length RETRIEVER_SEQ_LENGTH] +[default1]: [--no-log-learnig-rate-to-tensorboard] +[default1]: [--no-log-loss-scale-to-tensorboard] +[default2]: [--sample-rate SAMPLE_RATE] [--mask-prob MASK_PROB] +[default3]: [--ict-head-size ICT_HEAD_SIZE] +[default1]: [--log-validation-ppl-to-tensorboard] +[default2]: [--short-seq-prob SHORT_SEQ_PROB] [--mmap-warmup] +[default2]: [--num-workers NUM_WORKERS] +[default2]: [--valid-num-workers VALID_NUM_WORKERS] +[default2]: [--tokenizer-type {BertWordPieceLowerCase,BertWordPieceCase,GPT2BPETokenizer,PretrainedFromHF}] +[default2]: [--tokenizer-name-or-path TOKENIZER_NAME_OR_PATH] +[default4]: [--valid-weighted-split-paths [VALID_WEIGHTED_SPLIT_PATHS [VALID_WEIGHTED_SPLIT_PATHS ...]]] +[default4]: [--test-weighted-split-paths [TEST_WEIGHTED_SPLIT_PATHS [TEST_WEIGHTED_SPLIT_PATHS ...]]] +[default2]: [--data-impl {lazy,cached,mmap,infer}] +[default2]: [--reset-position-ids] [--reset-attention-mask] +[default2]: [--eod-mask-loss] [--loss-on-targets-only] +[default4]: [--train-weighted-split-paths-path TRAIN_WEIGHTED_SPLIT_PATHS_PATH] +[default4]: [--valid-weighted-split-paths-path VALID_WEIGHTED_SPLIT_PATHS_PATH] +[default3]: [--biencoder-projection-dim BIENCODER_PROJECTION_DIM] +[default3]: [--biencoder-shared-query-context-model] +[default4]: [--test-weighted-split-paths-path TEST_WEIGHTED_SPLIT_PATHS_PATH] +[default1]: [--zero-stage ZERO_STAGE] [--zero-reduce-scatter] +[default1]: [--zero-contigious-gradients] +[default3]: [--ict-load ICT_LOAD] [--bert-load BERT_LOAD] +[default3]: [--titles-data-path TITLES_DATA_PATH] +[default1]: [--zero-reduce-bucket-size ZERO_REDUCE_BUCKET_SIZE] +[default1]: [--zero-allgather-bucket-size ZERO_ALLGATHER_BUCKET_SIZE] +[default1]: [--remote-device {none,cpu,nvme}] [--use-pin-memory] +[default3]: [--query-in-block-prob QUERY_IN_BLOCK_PROB] +[default3]: [--use-one-sent-docs] +[default4]: [--log-path LOG_PATH] [--vocab-file VOCAB_FILE] +[default4]: [--merge-file MERGE_FILE] +[default3]: [--evidence-data-path EVIDENCE_DATA_PATH] +[default2]: [--norm-target-loss] +[default3]: [--retriever-report-topk-accuracies RETRIEVER_REPORT_TOPK_ACCURACIES [RETRIEVER_REPORT_TOPK_ACCURACIES ...]] +[default3]: [--retriever-score-scaling] +[default4]: [--vocab-extra-ids VOCAB_EXTRA_IDS] +[default4]: [--seq-length SEQ_LENGTH] +[default1]: [--scattered-embeddings] [--split-transformers] +[default1]: [--memory-centric-tiled-linear] +[default1]: [--tile-factor TILE_FACTOR] +[default1]: [--deepspeed-activation-checkpointing] +[default4]: [--encoder-seq-length ENCODER_SEQ_LENGTH] +[default4]: [--decoder-seq-length DECODER_SEQ_LENGTH] +[default1]: [--partition-activations] [--contigious-checkpointing] +[default2]: [--reweight-loss-based-on-position-frequency] +[default2]: [--noise-density NOISE_DENSITY] +[default2]: [--mean-noise-span-length MEAN_NOISE_SPAN_LENGTH] +[default2]: [--prefixlm] [--adlr-autoresume] +[default2]: [--adlr-autoresume-interval ADLR_AUTORESUME_INTERVAL] +[default4]: [--retriever-seq-length RETRIEVER_SEQ_LENGTH] +[default3]: [--block-data-path BLOCK_DATA_PATH] +[default2]: [--ict-head-size ICT_HEAD_SIZE] +[default2]: [--biencoder-projection-dim BIENCODER_PROJECTION_DIM] +[default2]: [--biencoder-shared-query-context-model] +[default2]: [--ict-load ICT_LOAD] [--bert-load BERT_LOAD] +[default1]: [--checkpoint-in-cpu] [--synchronize-each-layer] +[default4]: [--sample-rate SAMPLE_RATE] [--mask-prob MASK_PROB] +[default4]: [--short-seq-prob SHORT_SEQ_PROB] [--mmap-warmup] +[default4]: [--num-workers NUM_WORKERS] +[default1]: [--profile-backward] [--deepspeed] +[default1]: [--deepspeed_config DEEPSPEED_CONFIG] [--deepscale] +[default1]: [--deepscale_config DEEPSCALE_CONFIG] [--deepspeed_mpi] +[default1]:finetune_t0.py: error: unrecognized arguments: --reset-progress +[default4]: [--valid-num-workers VALID_NUM_WORKERS] +[default4]: [--tokenizer-type {BertWordPieceLowerCase,BertWordPieceCase,GPT2BPETokenizer,PretrainedFromHF}] +[default3]: [--embedding-path EMBEDDING_PATH] +[default3]: [--indexer-batch-size INDEXER_BATCH_SIZE] +[default3]: [--indexer-log-interval INDEXER_LOG_INTERVAL] +[default3]: [--num-classes NUM_CLASSES] [--img-dim IMG_DIM] +[default4]: [--tokenizer-name-or-path TOKENIZER_NAME_OR_PATH] +[default2]: [--titles-data-path TITLES_DATA_PATH] +[default2]: [--query-in-block-prob QUERY_IN_BLOCK_PROB] +[default3]: [--num-channels NUM_CHANNELS] [--patch-dim PATCH_DIM] +[default3]: [--log-params-norm] [--log-num-zeros-in-grad] +[default3]: [--tensorboard-log-interval TENSORBOARD_LOG_INTERVAL] +[default3]: [--tensorboard-queue-size TENSORBOARD_QUEUE_SIZE] +[default3]: [--log-timers-to-tensorboard] +[default4]: [--data-impl {lazy,cached,mmap,infer}] +[default3]: [--log-batch-size-to-tensorboard] +[default4]: [--reset-position-ids] [--reset-attention-mask] +[default3]: [--no-log-learnig-rate-to-tensorboard] +[default4]: [--eod-mask-loss] [--loss-on-targets-only] +[default3]: [--no-log-loss-scale-to-tensorboard] +[default4]: [--norm-target-loss] +[default2]: [--use-one-sent-docs] +[default3]: [--log-validation-ppl-to-tensorboard] +[default3]: [--zero-stage ZERO_STAGE] [--zero-reduce-scatter] +[default3]: [--zero-contigious-gradients] +[default4]: [--reweight-loss-based-on-position-frequency] +[default2]: [--evidence-data-path EVIDENCE_DATA_PATH] +[default2]: [--retriever-report-topk-accuracies RETRIEVER_REPORT_TOPK_ACCURACIES [RETRIEVER_REPORT_TOPK_ACCURACIES ...]] +[default2]: [--retriever-score-scaling] +[default3]: [--zero-reduce-bucket-size ZERO_REDUCE_BUCKET_SIZE] +[default3]: [--zero-allgather-bucket-size ZERO_ALLGATHER_BUCKET_SIZE] +[default3]: [--remote-device {none,cpu,nvme}] [--use-pin-memory] +[default3]: [--scattered-embeddings] [--split-transformers] +[default4]: [--noise-density NOISE_DENSITY] +[default4]: [--mean-noise-span-length MEAN_NOISE_SPAN_LENGTH] +[default3]: [--memory-centric-tiled-linear] +[default3]: [--tile-factor TILE_FACTOR] +[default2]: [--block-data-path BLOCK_DATA_PATH] +[default2]: [--embedding-path EMBEDDING_PATH] +[default3]: [--deepspeed-activation-checkpointing] +[default3]: [--partition-activations] [--contigious-checkpointing] +[default3]: [--checkpoint-in-cpu] [--synchronize-each-layer] +[default3]: [--profile-backward] [--deepspeed] +[default4]: [--prefixlm] [--adlr-autoresume] +[default2]: [--indexer-batch-size INDEXER_BATCH_SIZE] +[default2]: [--indexer-log-interval INDEXER_LOG_INTERVAL] +[default4]: [--adlr-autoresume-interval ADLR_AUTORESUME_INTERVAL] +[default4]: [--ict-head-size ICT_HEAD_SIZE] +[default2]: [--num-classes NUM_CLASSES] [--img-dim IMG_DIM] +[default4]: [--biencoder-projection-dim BIENCODER_PROJECTION_DIM] +[default4]: [--biencoder-shared-query-context-model] +[default2]: [--num-channels NUM_CHANNELS] [--patch-dim PATCH_DIM] +[default3]: [--deepspeed_config DEEPSPEED_CONFIG] [--deepscale] +[default3]: [--deepscale_config DEEPSCALE_CONFIG] [--deepspeed_mpi] +[default3]:finetune_t0.py: error: unrecognized arguments: --reset-progress +[default4]: [--ict-load ICT_LOAD] [--bert-load BERT_LOAD] +[default2]: [--log-params-norm] [--log-num-zeros-in-grad] +[default2]: [--tensorboard-log-interval TENSORBOARD_LOG_INTERVAL] +[default4]: [--titles-data-path TITLES_DATA_PATH] +[default2]: [--tensorboard-queue-size TENSORBOARD_QUEUE_SIZE] +[default4]: [--query-in-block-prob QUERY_IN_BLOCK_PROB] +[default4]: [--use-one-sent-docs] +[default2]: [--log-timers-to-tensorboard] +[default2]: [--log-batch-size-to-tensorboard] +[default4]: [--evidence-data-path EVIDENCE_DATA_PATH] +[default4]: [--retriever-report-topk-accuracies RETRIEVER_REPORT_TOPK_ACCURACIES [RETRIEVER_REPORT_TOPK_ACCURACIES ...]] +[default2]: [--no-log-learnig-rate-to-tensorboard] +[default2]: [--no-log-loss-scale-to-tensorboard] +[default4]: [--retriever-score-scaling] +[default4]: [--block-data-path BLOCK_DATA_PATH] +[default4]: [--embedding-path EMBEDDING_PATH] +[default4]: [--indexer-batch-size INDEXER_BATCH_SIZE] +[default2]: [--log-validation-ppl-to-tensorboard] +[default4]: [--indexer-log-interval INDEXER_LOG_INTERVAL] +[default4]: [--num-classes NUM_CLASSES] [--img-dim IMG_DIM] +[default2]: [--zero-stage ZERO_STAGE] [--zero-reduce-scatter] +[default2]: [--zero-contigious-gradients] +[default2]: [--zero-reduce-bucket-size ZERO_REDUCE_BUCKET_SIZE] +[default4]: [--num-channels NUM_CHANNELS] [--patch-dim PATCH_DIM] +[default4]: [--log-params-norm] [--log-num-zeros-in-grad] +[default4]: [--tensorboard-log-interval TENSORBOARD_LOG_INTERVAL] +[default2]: [--zero-allgather-bucket-size ZERO_ALLGATHER_BUCKET_SIZE] +[default2]: [--remote-device {none,cpu,nvme}] [--use-pin-memory] +[default2]: [--scattered-embeddings] [--split-transformers] +[default4]: [--tensorboard-queue-size TENSORBOARD_QUEUE_SIZE] +[default4]: [--log-timers-to-tensorboard] +[default4]: [--log-batch-size-to-tensorboard] +[default4]: [--no-log-learnig-rate-to-tensorboard] +[default2]: [--memory-centric-tiled-linear] +[default2]: [--tile-factor TILE_FACTOR] +[default2]: [--deepspeed-activation-checkpointing] +[default4]: [--no-log-loss-scale-to-tensorboard] +[default4]: [--log-validation-ppl-to-tensorboard] +[default4]: [--zero-stage ZERO_STAGE] [--zero-reduce-scatter] +[default4]: [--zero-contigious-gradients] +[default4]: [--zero-reduce-bucket-size ZERO_REDUCE_BUCKET_SIZE] +[default2]: [--partition-activations] [--contigious-checkpointing] +[default2]: [--checkpoint-in-cpu] [--synchronize-each-layer] +[default2]: [--profile-backward] [--deepspeed] +[default4]: [--zero-allgather-bucket-size ZERO_ALLGATHER_BUCKET_SIZE] +[default4]: [--remote-device {none,cpu,nvme}] [--use-pin-memory] +[default4]: [--scattered-embeddings] [--split-transformers] +[default2]: [--deepspeed_config DEEPSPEED_CONFIG] [--deepscale] +[default2]: [--deepscale_config DEEPSCALE_CONFIG] [--deepspeed_mpi] +[default2]:finetune_t0.py: error: unrecognized arguments: --reset-progress +[default4]: [--memory-centric-tiled-linear] +[default4]: [--tile-factor TILE_FACTOR] +[default4]: [--deepspeed-activation-checkpointing] +[default4]: [--partition-activations] [--contigious-checkpointing] +[default4]: [--checkpoint-in-cpu] [--synchronize-each-layer] +[default4]: [--profile-backward] [--deepspeed] +[default4]: [--deepspeed_config DEEPSPEED_CONFIG] [--deepscale] +[default4]: [--deepscale_config DEEPSCALE_CONFIG] [--deepspeed_mpi] +[default4]:finetune_t0.py: error: unrecognized arguments: --reset-progress +[default5]:usage: finetune_t0.py [-h] [--num-layers NUM_LAYERS] +[default6]:usage: finetune_t0.py [-h] [--num-layers NUM_LAYERS] +[default6]: [--hidden-size HIDDEN_SIZE] +[default6]: [--ffn-hidden-size FFN_HIDDEN_SIZE] +[default6]: [--num-attention-heads NUM_ATTENTION_HEADS] +[default6]: [--kv-channels KV_CHANNELS] +[default6]: [--max-position-embeddings MAX_POSITION_EMBEDDINGS] +[default6]: [--make-vocab-size-divisible-by MAKE_VOCAB_SIZE_DIVISIBLE_BY] +[default6]: [--pad-vocab-size-to PAD_VOCAB_SIZE_TO] +[default6]: [--layernorm-epsilon LAYERNORM_EPSILON] +[default6]: [--sync-tp-duplicated-parameters] +[default6]: [--apply-residual-connection-post-layernorm] +[default6]: [--embed-layernorm] [--openai-gelu] +[default6]: [--onnx-safe ONNX_SAFE] [--bert-no-binary-head] +[default6]: [--position-embedding-type {PositionEmbeddingType.rotary,PositionEmbeddingType.absolute,PositionEmbeddingType.alibi}] +[default6]: [--glu-activation {geglu,liglu,reglu,swiglu}] +[default6]: [--kill-switch-path KILL_SWITCH_PATH] +[default6]: [--log-level {debug,info,warning,error,critical}] +[default6]: [--log-level-replica {debug,info,warning,error,critical}] +[default6]: [--attention-dropout ATTENTION_DROPOUT] +[default6]: [--hidden-dropout HIDDEN_DROPOUT] +[default6]: [--weight-decay WEIGHT_DECAY] [--clip-grad CLIP_GRAD] +[default6]: [--adam-beta1 ADAM_BETA1] [--adam-beta2 ADAM_BETA2] +[default6]: [--adam-eps ADAM_EPS] [--sgd-momentum SGD_MOMENTUM] +[default6]: [--micro-batch-size MICRO_BATCH_SIZE] +[default6]: [--batch-size BATCH_SIZE] +[default6]: [--global-batch-size GLOBAL_BATCH_SIZE] +[default6]: [--rampup-batch-size [RAMPUP_BATCH_SIZE [RAMPUP_BATCH_SIZE ...]]] +[default6]: [--checkpoint-activations] +[default6]: [--distribute-checkpointed-activations] +[default6]: [--checkpoint-num-layers CHECKPOINT_NUM_LAYERS] +[default6]: [--train-iters TRAIN_ITERS] +[default6]: [--train-samples TRAIN_SAMPLES] +[default6]: [--train-tokens TRAIN_TOKENS] +[default6]: [--log-interval LOG_INTERVAL] +[default6]: [--exit-interval EXIT_INTERVAL] +[default6]: [--exit-duration-in-mins EXIT_DURATION_IN_MINS] +[default6]: [--tensorboard-dir TENSORBOARD_DIR] +[default6]: [--no-masked-softmax-fusion] [--no-bias-gelu-fusion] +[default6]: [--no-bias-dropout-fusion] [--optimizer {adam,sgd}] +[default6]: [--use-bnb-optimizer] +[default6]: [--dataloader-type {single,cyclic}] [--cpu-optimizer] +[default6]: [--cpu_torch_adam] [--codecarbon-dir CODECARBON_DIR] +[default6]: [--eval-only EVAL_ONLY] +[default6]: [--skip-train-iteration-range SKIP_TRAIN_ITERATION_RANGE [SKIP_TRAIN_ITERATION_RANGE ...]] +[default6]: [--inference] +[default6]: [--abort-on-unmet-fused-kernel-constraints] +[default6]: [--pp-partition-method PP_PARTITION_METHOD] +[default6]: [--seed SEED] [--init-method-std INIT_METHOD_STD] +[default6]: [--init-method-xavier-uniform] [--lr LR] +[default6]: [--lr-decay-style {constant,linear,cosine}] +[default6]: [--lr-decay-iters LR_DECAY_ITERS] +[default6]: [--lr-decay-samples LR_DECAY_SAMPLES] +[default6]: [--lr-decay-tokens LR_DECAY_TOKENS] +[default6]: [--lr-warmup-fraction LR_WARMUP_FRACTION] +[default6]: [--lr-warmup-iters LR_WARMUP_ITERS] +[default6]: [--lr-warmup-samples LR_WARMUP_SAMPLES] +[default6]: [--warmup WARMUP] [--min-lr MIN_LR] +[default6]: [--override-lr-scheduler] +[default6]: [--use-checkpoint-lr-scheduler] [--save SAVE] +[default6]: [--save-interval SAVE_INTERVAL] [--no-save-optim] +[default6]: [--no-save-rng] [--load LOAD] [--no-load-optim] +[default6]: [--no-load-rng] [--finetune] [--fp16] [--bf16] +[default6]: [--loss-scale LOSS_SCALE] +[default6]: [--initial-loss-scale INITIAL_LOSS_SCALE] +[default6]: [--min-loss-scale MIN_LOSS_SCALE] +[default6]: [--loss-scale-window LOSS_SCALE_WINDOW] +[default6]: [--hysteresis HYSTERESIS] [--fp32-residual-connection] +[default6]: [--no-query-key-layer-scaling] +[default6]: [--attention-softmax-in-fp32] +[default6]: [--accumulate-allreduce-grads-in-fp32] +[default6]: [--fp16-lm-cross-entropy] +[default6]: [--tensor-model-parallel-size TENSOR_MODEL_PARALLEL_SIZE] +[default6]: [--pipeline-model-parallel-size PIPELINE_MODEL_PARALLEL_SIZE] +[default6]: [--model-parallel-size MODEL_PARALLEL_SIZE] +[default6]: [--num-layers-per-virtual-pipeline-stage NUM_LAYERS_PER_VIRTUAL_PIPELINE_STAGE] +[default6]: [--distributed-backend {nccl,gloo}] +[default6]: [--DDP-impl {local,torch}] +[default6]: [--use-contiguous-buffers-in-ddp] +[default6]: [--no-scatter-gather-tensors-in-pipeline] +[default6]: [--local_rank LOCAL_RANK] +[default6]: [--lazy-mpu-init LAZY_MPU_INIT] +[default6]: [--use-cpu-initialization] [--eval-iters EVAL_ITERS] +[default6]: [--eval-interval EVAL_INTERVAL] +[default6]: [--data-path [DATA_PATH [DATA_PATH ...]]] +[default6]: [--split SPLIT] +[default6]: [--train-weighted-split-paths [TRAIN_WEIGHTED_SPLIT_PATHS [TRAIN_WEIGHTED_SPLIT_PATHS ...]]] +[default6]: [--valid-weighted-split-paths [VALID_WEIGHTED_SPLIT_PATHS [VALID_WEIGHTED_SPLIT_PATHS ...]]] +[default6]: [--test-weighted-split-paths [TEST_WEIGHTED_SPLIT_PATHS [TEST_WEIGHTED_SPLIT_PATHS ...]]] +[default6]: [--train-weighted-split-paths-path TRAIN_WEIGHTED_SPLIT_PATHS_PATH] +[default6]: [--valid-weighted-split-paths-path VALID_WEIGHTED_SPLIT_PATHS_PATH] +[default6]: [--test-weighted-split-paths-path TEST_WEIGHTED_SPLIT_PATHS_PATH] +[default6]: [--log-path LOG_PATH] [--vocab-file VOCAB_FILE] +[default6]: [--merge-file MERGE_FILE] +[default6]: [--vocab-extra-ids VOCAB_EXTRA_IDS] +[default6]: [--seq-length SEQ_LENGTH] +[default6]: [--encoder-seq-length ENCODER_SEQ_LENGTH] +[default6]: [--decoder-seq-length DECODER_SEQ_LENGTH] +[default6]: [--retriever-seq-length RETRIEVER_SEQ_LENGTH] +[default6]: [--sample-rate SAMPLE_RATE] [--mask-prob MASK_PROB] +[default6]: [--short-seq-prob SHORT_SEQ_PROB] [--mmap-warmup] +[default6]: [--num-workers NUM_WORKERS] +[default6]: [--valid-num-workers VALID_NUM_WORKERS] +[default6]: [--tokenizer-type {BertWordPieceLowerCase,BertWordPieceCase,GPT2BPETokenizer,PretrainedFromHF}] +[default5]: [--hidden-size HIDDEN_SIZE] +[default5]: [--ffn-hidden-size FFN_HIDDEN_SIZE] +[default5]: [--num-attention-heads NUM_ATTENTION_HEADS] +[default5]: [--kv-channels KV_CHANNELS] +[default6]: [--tokenizer-name-or-path TOKENIZER_NAME_OR_PATH] +[default5]: [--max-position-embeddings MAX_POSITION_EMBEDDINGS] +[default6]: [--data-impl {lazy,cached,mmap,infer}] +[default5]: [--make-vocab-size-divisible-by MAKE_VOCAB_SIZE_DIVISIBLE_BY] +[default5]: [--pad-vocab-size-to PAD_VOCAB_SIZE_TO] +[default7]:usage: finetune_t0.py [-h] [--num-layers NUM_LAYERS] +[default7]: [--hidden-size HIDDEN_SIZE] +[default7]: [--ffn-hidden-size FFN_HIDDEN_SIZE] +[default7]: [--num-attention-heads NUM_ATTENTION_HEADS] +[default7]: [--kv-channels KV_CHANNELS] +[default7]: [--max-position-embeddings MAX_POSITION_EMBEDDINGS] +[default7]: [--make-vocab-size-divisible-by MAKE_VOCAB_SIZE_DIVISIBLE_BY] +[default7]: [--pad-vocab-size-to PAD_VOCAB_SIZE_TO] +[default7]: [--layernorm-epsilon LAYERNORM_EPSILON] +[default7]: [--sync-tp-duplicated-parameters] +[default7]: [--apply-residual-connection-post-layernorm] +[default7]: [--embed-layernorm] [--openai-gelu] +[default7]: [--onnx-safe ONNX_SAFE] [--bert-no-binary-head] +[default7]: [--position-embedding-type {PositionEmbeddingType.rotary,PositionEmbeddingType.absolute,PositionEmbeddingType.alibi}] +[default5]: [--layernorm-epsilon LAYERNORM_EPSILON] +[default7]: [--glu-activation {geglu,liglu,reglu,swiglu}] +[default7]: [--kill-switch-path KILL_SWITCH_PATH] +[default7]: [--log-level {debug,info,warning,error,critical}] +[default7]: [--log-level-replica {debug,info,warning,error,critical}] +[default7]: [--attention-dropout ATTENTION_DROPOUT] +[default7]: [--hidden-dropout HIDDEN_DROPOUT] +[default7]: [--weight-decay WEIGHT_DECAY] [--clip-grad CLIP_GRAD] +[default7]: [--adam-beta1 ADAM_BETA1] [--adam-beta2 ADAM_BETA2] +[default7]: [--adam-eps ADAM_EPS] [--sgd-momentum SGD_MOMENTUM] +[default7]: [--micro-batch-size MICRO_BATCH_SIZE] +[default7]: [--batch-size BATCH_SIZE] +[default7]: [--global-batch-size GLOBAL_BATCH_SIZE] +[default7]: [--rampup-batch-size [RAMPUP_BATCH_SIZE [RAMPUP_BATCH_SIZE ...]]] +[default7]: [--checkpoint-activations] +[default7]: [--distribute-checkpointed-activations] +[default7]: [--checkpoint-num-layers CHECKPOINT_NUM_LAYERS] +[default7]: [--train-iters TRAIN_ITERS] +[default7]: [--train-samples TRAIN_SAMPLES] +[default7]: [--train-tokens TRAIN_TOKENS] +[default5]: [--sync-tp-duplicated-parameters] +[default5]: [--apply-residual-connection-post-layernorm] +[default5]: [--embed-layernorm] [--openai-gelu] +[default5]: [--onnx-safe ONNX_SAFE] [--bert-no-binary-head] +[default5]: [--position-embedding-type {PositionEmbeddingType.rotary,PositionEmbeddingType.absolute,PositionEmbeddingType.alibi}] +[default5]: [--glu-activation {geglu,liglu,reglu,swiglu}] +[default7]: [--log-interval LOG_INTERVAL] +[default7]: [--exit-interval EXIT_INTERVAL] +[default7]: [--exit-duration-in-mins EXIT_DURATION_IN_MINS] +[default7]: [--tensorboard-dir TENSORBOARD_DIR] +[default7]: [--no-masked-softmax-fusion] [--no-bias-gelu-fusion] +[default7]: [--no-bias-dropout-fusion] [--optimizer {adam,sgd}] +[default7]: [--use-bnb-optimizer] +[default7]: [--dataloader-type {single,cyclic}] [--cpu-optimizer] +[default7]: [--cpu_torch_adam] [--codecarbon-dir CODECARBON_DIR] +[default7]: [--eval-only EVAL_ONLY] +[default7]: [--skip-train-iteration-range SKIP_TRAIN_ITERATION_RANGE [SKIP_TRAIN_ITERATION_RANGE ...]] +[default7]: [--inference] +[default7]: [--abort-on-unmet-fused-kernel-constraints] +[default7]: [--pp-partition-method PP_PARTITION_METHOD] +[default5]: [--kill-switch-path KILL_SWITCH_PATH] +[default7]: [--seed SEED] [--init-method-std INIT_METHOD_STD] +[default5]: [--log-level {debug,info,warning,error,critical}] +[default5]: [--log-level-replica {debug,info,warning,error,critical}] +[default6]: [--reset-position-ids] [--reset-attention-mask] +[default6]: [--eod-mask-loss] [--loss-on-targets-only] +[default6]: [--norm-target-loss] +[default6]: [--reweight-loss-based-on-position-frequency] +[default6]: [--noise-density NOISE_DENSITY] +[default6]: [--mean-noise-span-length MEAN_NOISE_SPAN_LENGTH] +[default6]: [--prefixlm] [--adlr-autoresume] +[default6]: [--adlr-autoresume-interval ADLR_AUTORESUME_INTERVAL] +[default6]: [--ict-head-size ICT_HEAD_SIZE] +[default5]: [--attention-dropout ATTENTION_DROPOUT] +[default5]: [--hidden-dropout HIDDEN_DROPOUT] +[default5]: [--weight-decay WEIGHT_DECAY] [--clip-grad CLIP_GRAD] +[default5]: [--adam-beta1 ADAM_BETA1] [--adam-beta2 ADAM_BETA2] +[default7]: [--init-method-xavier-uniform] [--lr LR] +[default7]: [--lr-decay-style {constant,linear,cosine}] +[default7]: [--lr-decay-iters LR_DECAY_ITERS] +[default7]: [--lr-decay-samples LR_DECAY_SAMPLES] +[default7]: [--lr-decay-tokens LR_DECAY_TOKENS] +[default7]: [--lr-warmup-fraction LR_WARMUP_FRACTION] +[default7]: [--lr-warmup-iters LR_WARMUP_ITERS] +[default7]: [--lr-warmup-samples LR_WARMUP_SAMPLES] +[default7]: [--warmup WARMUP] [--min-lr MIN_LR] +[default7]: [--override-lr-scheduler] +[default5]: [--adam-eps ADAM_EPS] [--sgd-momentum SGD_MOMENTUM] +[default5]: [--micro-batch-size MICRO_BATCH_SIZE] +[default7]: [--use-checkpoint-lr-scheduler] [--save SAVE] +[default7]: [--save-interval SAVE_INTERVAL] [--no-save-optim] +[default7]: [--no-save-rng] [--load LOAD] [--no-load-optim] +[default5]: [--batch-size BATCH_SIZE] +[default5]: [--global-batch-size GLOBAL_BATCH_SIZE] +[default6]: [--biencoder-projection-dim BIENCODER_PROJECTION_DIM] +[default6]: [--biencoder-shared-query-context-model] +[default6]: [--ict-load ICT_LOAD] [--bert-load BERT_LOAD] +[default5]: [--rampup-batch-size [RAMPUP_BATCH_SIZE [RAMPUP_BATCH_SIZE ...]]] +[default5]: [--checkpoint-activations] +[default7]: [--no-load-rng] [--finetune] [--fp16] [--bf16] +[default7]: [--loss-scale LOSS_SCALE] +[default5]: [--distribute-checkpointed-activations] +[default5]: [--checkpoint-num-layers CHECKPOINT_NUM_LAYERS] +[default7]: [--initial-loss-scale INITIAL_LOSS_SCALE] +[default7]: [--min-loss-scale MIN_LOSS_SCALE] +[default7]: [--loss-scale-window LOSS_SCALE_WINDOW] +[default5]: [--train-iters TRAIN_ITERS] +[default5]: [--train-samples TRAIN_SAMPLES] +[default5]: [--train-tokens TRAIN_TOKENS] +[default5]: [--log-interval LOG_INTERVAL] +[default5]: [--exit-interval EXIT_INTERVAL] +[default7]: [--hysteresis HYSTERESIS] [--fp32-residual-connection] +[default7]: [--no-query-key-layer-scaling] +[default5]: [--exit-duration-in-mins EXIT_DURATION_IN_MINS] +[default5]: [--tensorboard-dir TENSORBOARD_DIR] +[default6]: [--titles-data-path TITLES_DATA_PATH] +[default6]: [--query-in-block-prob QUERY_IN_BLOCK_PROB] +[default7]: [--attention-softmax-in-fp32] +[default7]: [--accumulate-allreduce-grads-in-fp32] +[default5]: [--no-masked-softmax-fusion] [--no-bias-gelu-fusion] +[default5]: [--no-bias-dropout-fusion] [--optimizer {adam,sgd}] +[default7]: [--fp16-lm-cross-entropy] +[default7]: [--tensor-model-parallel-size TENSOR_MODEL_PARALLEL_SIZE] +[default7]: [--pipeline-model-parallel-size PIPELINE_MODEL_PARALLEL_SIZE] +[default5]: [--use-bnb-optimizer] +[default5]: [--dataloader-type {single,cyclic}] [--cpu-optimizer] +[default5]: [--cpu_torch_adam] [--codecarbon-dir CODECARBON_DIR] +[default5]: [--eval-only EVAL_ONLY] +[default7]: [--model-parallel-size MODEL_PARALLEL_SIZE] +[default7]: [--num-layers-per-virtual-pipeline-stage NUM_LAYERS_PER_VIRTUAL_PIPELINE_STAGE] +[default7]: [--distributed-backend {nccl,gloo}] +[default5]: [--skip-train-iteration-range SKIP_TRAIN_ITERATION_RANGE [SKIP_TRAIN_ITERATION_RANGE ...]] +[default5]: [--inference] +[default6]: [--use-one-sent-docs] +[default6]: [--evidence-data-path EVIDENCE_DATA_PATH] +[default6]: [--retriever-report-topk-accuracies RETRIEVER_REPORT_TOPK_ACCURACIES [RETRIEVER_REPORT_TOPK_ACCURACIES ...]] +[default6]: [--retriever-score-scaling] +[default7]: [--DDP-impl {local,torch}] +[default7]: [--use-contiguous-buffers-in-ddp] +[default6]: [--block-data-path BLOCK_DATA_PATH] +[default6]: [--embedding-path EMBEDDING_PATH] +[default5]: [--abort-on-unmet-fused-kernel-constraints] +[default5]: [--pp-partition-method PP_PARTITION_METHOD] +[default5]: [--seed SEED] [--init-method-std INIT_METHOD_STD] +[default5]: [--init-method-xavier-uniform] [--lr LR] +[default7]: [--no-scatter-gather-tensors-in-pipeline] +[default7]: [--local_rank LOCAL_RANK] +[default6]: [--indexer-batch-size INDEXER_BATCH_SIZE] +[default6]: [--indexer-log-interval INDEXER_LOG_INTERVAL] +[default6]: [--num-classes NUM_CLASSES] [--img-dim IMG_DIM] +[default5]: [--lr-decay-style {constant,linear,cosine}] +[default5]: [--lr-decay-iters LR_DECAY_ITERS] +[default5]: [--lr-decay-samples LR_DECAY_SAMPLES] +[default5]: [--lr-decay-tokens LR_DECAY_TOKENS] +[default6]: [--num-channels NUM_CHANNELS] [--patch-dim PATCH_DIM] +[default6]: [--log-params-norm] [--log-num-zeros-in-grad] +[default7]: [--lazy-mpu-init LAZY_MPU_INIT] +[default7]: [--use-cpu-initialization] [--eval-iters EVAL_ITERS] +[default5]: [--lr-warmup-fraction LR_WARMUP_FRACTION] +[default5]: [--lr-warmup-iters LR_WARMUP_ITERS] +[default5]: [--lr-warmup-samples LR_WARMUP_SAMPLES] +[default5]: [--warmup WARMUP] [--min-lr MIN_LR] +[default7]: [--eval-interval EVAL_INTERVAL] +[default5]: [--override-lr-scheduler] +[default5]: [--use-checkpoint-lr-scheduler] [--save SAVE] +[default5]: [--save-interval SAVE_INTERVAL] [--no-save-optim] +[default7]: [--data-path [DATA_PATH [DATA_PATH ...]]] +[default7]: [--split SPLIT] +[default6]: [--tensorboard-log-interval TENSORBOARD_LOG_INTERVAL] +[default7]: [--train-weighted-split-paths [TRAIN_WEIGHTED_SPLIT_PATHS [TRAIN_WEIGHTED_SPLIT_PATHS ...]]] +[default7]: [--valid-weighted-split-paths [VALID_WEIGHTED_SPLIT_PATHS [VALID_WEIGHTED_SPLIT_PATHS ...]]] +[default6]: [--tensorboard-queue-size TENSORBOARD_QUEUE_SIZE] +[default6]: [--log-timers-to-tensorboard] +[default6]: [--log-batch-size-to-tensorboard] +[default6]: [--no-log-learnig-rate-to-tensorboard] +[default6]: [--no-log-loss-scale-to-tensorboard] +[default6]: [--log-validation-ppl-to-tensorboard] +[default6]: [--zero-stage ZERO_STAGE] [--zero-reduce-scatter] +[default7]: [--test-weighted-split-paths [TEST_WEIGHTED_SPLIT_PATHS [TEST_WEIGHTED_SPLIT_PATHS ...]]] +[default7]: [--train-weighted-split-paths-path TRAIN_WEIGHTED_SPLIT_PATHS_PATH] +[default5]: [--no-save-rng] [--load LOAD] [--no-load-optim] +[default5]: [--no-load-rng] [--finetune] [--fp16] [--bf16] +[default7]: [--valid-weighted-split-paths-path VALID_WEIGHTED_SPLIT_PATHS_PATH] +[default7]: [--test-weighted-split-paths-path TEST_WEIGHTED_SPLIT_PATHS_PATH] +[default6]: [--zero-contigious-gradients] +[default6]: [--zero-reduce-bucket-size ZERO_REDUCE_BUCKET_SIZE] +[default7]: [--log-path LOG_PATH] [--vocab-file VOCAB_FILE] +[default7]: [--merge-file MERGE_FILE] +[default7]: [--vocab-extra-ids VOCAB_EXTRA_IDS] +[default7]: [--seq-length SEQ_LENGTH] +[default6]: [--zero-allgather-bucket-size ZERO_ALLGATHER_BUCKET_SIZE] +[default7]: [--encoder-seq-length ENCODER_SEQ_LENGTH] +[default7]: [--decoder-seq-length DECODER_SEQ_LENGTH] +[default6]: [--remote-device {none,cpu,nvme}] [--use-pin-memory] +[default6]: [--scattered-embeddings] [--split-transformers] +[default6]: [--memory-centric-tiled-linear] +[default6]: [--tile-factor TILE_FACTOR] +[default5]: [--loss-scale LOSS_SCALE] +[default5]: [--initial-loss-scale INITIAL_LOSS_SCALE] +[default7]: [--retriever-seq-length RETRIEVER_SEQ_LENGTH] +[default7]: [--sample-rate SAMPLE_RATE] [--mask-prob MASK_PROB] +[default6]: [--deepspeed-activation-checkpointing] +[default6]: [--partition-activations] [--contigious-checkpointing] +[default5]: [--min-loss-scale MIN_LOSS_SCALE] +[default5]: [--loss-scale-window LOSS_SCALE_WINDOW] +[default7]: [--short-seq-prob SHORT_SEQ_PROB] [--mmap-warmup] +[default7]: [--num-workers NUM_WORKERS] +[default5]: [--hysteresis HYSTERESIS] [--fp32-residual-connection] +[default5]: [--no-query-key-layer-scaling] +[default6]: [--checkpoint-in-cpu] [--synchronize-each-layer] +[default6]: [--profile-backward] [--deepspeed] +[default6]: [--deepspeed_config DEEPSPEED_CONFIG] [--deepscale] +[default7]: [--valid-num-workers VALID_NUM_WORKERS] +[default7]: [--tokenizer-type {BertWordPieceLowerCase,BertWordPieceCase,GPT2BPETokenizer,PretrainedFromHF}] +[default7]: [--tokenizer-name-or-path TOKENIZER_NAME_OR_PATH] +[default7]: [--data-impl {lazy,cached,mmap,infer}] +[default6]: [--deepscale_config DEEPSCALE_CONFIG] [--deepspeed_mpi] +[default6]:finetune_t0.py: error: unrecognized arguments: --reset-progress +[default5]: [--attention-softmax-in-fp32] +[default5]: [--accumulate-allreduce-grads-in-fp32] +[default5]: [--fp16-lm-cross-entropy] +[default7]: [--reset-position-ids] [--reset-attention-mask] +[default7]: [--eod-mask-loss] [--loss-on-targets-only] +[default7]: [--norm-target-loss] +[default7]: [--reweight-loss-based-on-position-frequency] +[default7]: [--noise-density NOISE_DENSITY] +[default5]: [--tensor-model-parallel-size TENSOR_MODEL_PARALLEL_SIZE] +[default5]: [--pipeline-model-parallel-size PIPELINE_MODEL_PARALLEL_SIZE] +[default5]: [--model-parallel-size MODEL_PARALLEL_SIZE] +[default5]: [--num-layers-per-virtual-pipeline-stage NUM_LAYERS_PER_VIRTUAL_PIPELINE_STAGE] +[default5]: [--distributed-backend {nccl,gloo}] +[default7]: [--mean-noise-span-length MEAN_NOISE_SPAN_LENGTH] +[default7]: [--prefixlm] [--adlr-autoresume] +[default5]: [--DDP-impl {local,torch}] +[default5]: [--use-contiguous-buffers-in-ddp] +[default5]: [--no-scatter-gather-tensors-in-pipeline] +[default5]: [--local_rank LOCAL_RANK] +[default7]: [--adlr-autoresume-interval ADLR_AUTORESUME_INTERVAL] +[default7]: [--ict-head-size ICT_HEAD_SIZE] +[default5]: [--lazy-mpu-init LAZY_MPU_INIT] +[default5]: [--use-cpu-initialization] [--eval-iters EVAL_ITERS] +[default5]: [--eval-interval EVAL_INTERVAL] +[default5]: [--data-path [DATA_PATH [DATA_PATH ...]]] +[default7]: [--biencoder-projection-dim BIENCODER_PROJECTION_DIM] +[default7]: [--biencoder-shared-query-context-model] +[default5]: [--split SPLIT] +[default5]: [--train-weighted-split-paths [TRAIN_WEIGHTED_SPLIT_PATHS [TRAIN_WEIGHTED_SPLIT_PATHS ...]]] +[default5]: [--valid-weighted-split-paths [VALID_WEIGHTED_SPLIT_PATHS [VALID_WEIGHTED_SPLIT_PATHS ...]]] +[default5]: [--test-weighted-split-paths [TEST_WEIGHTED_SPLIT_PATHS [TEST_WEIGHTED_SPLIT_PATHS ...]]] +[default7]: [--ict-load ICT_LOAD] [--bert-load BERT_LOAD] +[default7]: [--titles-data-path TITLES_DATA_PATH] +[default5]: [--train-weighted-split-paths-path TRAIN_WEIGHTED_SPLIT_PATHS_PATH] +[default5]: [--valid-weighted-split-paths-path VALID_WEIGHTED_SPLIT_PATHS_PATH] +[default5]: [--test-weighted-split-paths-path TEST_WEIGHTED_SPLIT_PATHS_PATH] +[default7]: [--query-in-block-prob QUERY_IN_BLOCK_PROB] +[default7]: [--use-one-sent-docs] +[default5]: [--log-path LOG_PATH] [--vocab-file VOCAB_FILE] +[default7]: [--evidence-data-path EVIDENCE_DATA_PATH] +[default7]: [--retriever-report-topk-accuracies RETRIEVER_REPORT_TOPK_ACCURACIES [RETRIEVER_REPORT_TOPK_ACCURACIES ...]] +[default5]: [--merge-file MERGE_FILE] +[default5]: [--vocab-extra-ids VOCAB_EXTRA_IDS] +[default5]: [--seq-length SEQ_LENGTH] +[default5]: [--encoder-seq-length ENCODER_SEQ_LENGTH] +[default7]: [--retriever-score-scaling] +[default5]: [--decoder-seq-length DECODER_SEQ_LENGTH] +[default7]: [--block-data-path BLOCK_DATA_PATH] +[default7]: [--embedding-path EMBEDDING_PATH] +[default5]: [--retriever-seq-length RETRIEVER_SEQ_LENGTH] +[default5]: [--sample-rate SAMPLE_RATE] [--mask-prob MASK_PROB] +[default5]: [--short-seq-prob SHORT_SEQ_PROB] [--mmap-warmup] +[default7]: [--indexer-batch-size INDEXER_BATCH_SIZE] +[default7]: [--indexer-log-interval INDEXER_LOG_INTERVAL] +[default5]: [--num-workers NUM_WORKERS] +[default5]: [--valid-num-workers VALID_NUM_WORKERS] +[default5]: [--tokenizer-type {BertWordPieceLowerCase,BertWordPieceCase,GPT2BPETokenizer,PretrainedFromHF}] +[default5]: [--tokenizer-name-or-path TOKENIZER_NAME_OR_PATH] +[default7]: [--num-classes NUM_CLASSES] [--img-dim IMG_DIM] +[default7]: [--num-channels NUM_CHANNELS] [--patch-dim PATCH_DIM] +[default5]: [--data-impl {lazy,cached,mmap,infer}] +[default5]: [--reset-position-ids] [--reset-attention-mask] +[default5]: [--eod-mask-loss] [--loss-on-targets-only] +[default5]: [--norm-target-loss] +[default7]: [--log-params-norm] [--log-num-zeros-in-grad] +[default7]: [--tensorboard-log-interval TENSORBOARD_LOG_INTERVAL] +[default5]: [--reweight-loss-based-on-position-frequency] +[default5]: [--noise-density NOISE_DENSITY] +[default5]: [--mean-noise-span-length MEAN_NOISE_SPAN_LENGTH] +[default5]: [--prefixlm] [--adlr-autoresume] +[default7]: [--tensorboard-queue-size TENSORBOARD_QUEUE_SIZE] +[default7]: [--log-timers-to-tensorboard] +[default7]: [--log-batch-size-to-tensorboard] +[default5]: [--adlr-autoresume-interval ADLR_AUTORESUME_INTERVAL] +[default5]: [--ict-head-size ICT_HEAD_SIZE] +[default5]: [--biencoder-projection-dim BIENCODER_PROJECTION_DIM] +[default7]: [--no-log-learnig-rate-to-tensorboard] +[default7]: [--no-log-loss-scale-to-tensorboard] +[default5]: [--biencoder-shared-query-context-model] +[default5]: [--ict-load ICT_LOAD] [--bert-load BERT_LOAD] +[default5]: [--titles-data-path TITLES_DATA_PATH] +[default5]: [--query-in-block-prob QUERY_IN_BLOCK_PROB] +[default5]: [--use-one-sent-docs] +[default7]: [--log-validation-ppl-to-tensorboard] +[default7]: [--zero-stage ZERO_STAGE] [--zero-reduce-scatter] +[default5]: [--evidence-data-path EVIDENCE_DATA_PATH] +[default7]: [--zero-contigious-gradients] +[default7]: [--zero-reduce-bucket-size ZERO_REDUCE_BUCKET_SIZE] +[default5]: [--retriever-report-topk-accuracies RETRIEVER_REPORT_TOPK_ACCURACIES [RETRIEVER_REPORT_TOPK_ACCURACIES ...]] +[default5]: [--retriever-score-scaling] +[default7]: [--zero-allgather-bucket-size ZERO_ALLGATHER_BUCKET_SIZE] +[default7]: [--remote-device {none,cpu,nvme}] [--use-pin-memory] +[default5]: [--block-data-path BLOCK_DATA_PATH] +[default5]: [--embedding-path EMBEDDING_PATH] +[default5]: [--indexer-batch-size INDEXER_BATCH_SIZE] +[default7]: [--scattered-embeddings] [--split-transformers] +[default7]: [--memory-centric-tiled-linear] +[default5]: [--indexer-log-interval INDEXER_LOG_INTERVAL] +[default5]: [--num-classes NUM_CLASSES] [--img-dim IMG_DIM] +[default5]: [--num-channels NUM_CHANNELS] [--patch-dim PATCH_DIM] +[default5]: [--log-params-norm] [--log-num-zeros-in-grad] +[default7]: [--tile-factor TILE_FACTOR] +[default7]: [--deepspeed-activation-checkpointing] +[default5]: [--tensorboard-log-interval TENSORBOARD_LOG_INTERVAL] +[default5]: [--tensorboard-queue-size TENSORBOARD_QUEUE_SIZE] +[default5]: [--log-timers-to-tensorboard] +[default5]: [--log-batch-size-to-tensorboard] +[default7]: [--partition-activations] [--contigious-checkpointing] +[default7]: [--checkpoint-in-cpu] [--synchronize-each-layer] +[default7]: [--profile-backward] [--deepspeed] +[default7]: [--deepspeed_config DEEPSPEED_CONFIG] [--deepscale] +[default7]: [--deepscale_config DEEPSCALE_CONFIG] [--deepspeed_mpi] +[default7]:finetune_t0.py: error: unrecognized arguments: --reset-progress +[default5]: [--no-log-learnig-rate-to-tensorboard] +[default5]: [--no-log-loss-scale-to-tensorboard] +[default5]: [--log-validation-ppl-to-tensorboard] +[default5]: [--zero-stage ZERO_STAGE] [--zero-reduce-scatter] +[default5]: [--zero-contigious-gradients] +[default5]: [--zero-reduce-bucket-size ZERO_REDUCE_BUCKET_SIZE] +[default5]: [--zero-allgather-bucket-size ZERO_ALLGATHER_BUCKET_SIZE] +[default5]: [--remote-device {none,cpu,nvme}] [--use-pin-memory] +[default5]: [--scattered-embeddings] [--split-transformers] +[default5]: [--memory-centric-tiled-linear] +[default5]: [--tile-factor TILE_FACTOR] +[default5]: [--deepspeed-activation-checkpointing] +[default5]: [--partition-activations] [--contigious-checkpointing] +[default5]: [--checkpoint-in-cpu] [--synchronize-each-layer] +[default5]: [--profile-backward] [--deepspeed] +[default5]: [--deepspeed_config DEEPSPEED_CONFIG] [--deepscale] +[default5]: [--deepscale_config DEEPSCALE_CONFIG] [--deepspeed_mpi] +[default5]:finetune_t0.py: error: unrecognized arguments: --reset-progress +[default6]:usage: finetune_t0.py [-h] [--num-layers NUM_LAYERS] +[default6]: [--hidden-size HIDDEN_SIZE] +[default6]: [--ffn-hidden-size FFN_HIDDEN_SIZE] +[default6]: [--num-attention-heads NUM_ATTENTION_HEADS] +[default6]: [--kv-channels KV_CHANNELS] +[default6]: [--max-position-embeddings MAX_POSITION_EMBEDDINGS] +[default6]: [--make-vocab-size-divisible-by MAKE_VOCAB_SIZE_DIVISIBLE_BY] +[default6]: [--pad-vocab-size-to PAD_VOCAB_SIZE_TO] +[default6]: [--layernorm-epsilon LAYERNORM_EPSILON] +[default6]: [--sync-tp-duplicated-parameters] +[default6]: [--apply-residual-connection-post-layernorm] +[default6]: [--embed-layernorm] [--openai-gelu] +[default6]: [--onnx-safe ONNX_SAFE] [--bert-no-binary-head] +[default6]: [--position-embedding-type {PositionEmbeddingType.rotary,PositionEmbeddingType.absolute,PositionEmbeddingType.alibi}] +[default6]: [--glu-activation {geglu,liglu,reglu,swiglu}] +[default6]: [--kill-switch-path KILL_SWITCH_PATH] +[default6]: [--log-level {debug,info,warning,error,critical}] +[default6]: [--log-level-replica {debug,info,warning,error,critical}] +[default6]: [--attention-dropout ATTENTION_DROPOUT] +[default6]: [--hidden-dropout HIDDEN_DROPOUT] +[default6]: [--weight-decay WEIGHT_DECAY] [--clip-grad CLIP_GRAD] +[default6]: [--adam-beta1 ADAM_BETA1] [--adam-beta2 ADAM_BETA2] +[default6]: [--adam-eps ADAM_EPS] [--sgd-momentum SGD_MOMENTUM] +[default6]: [--micro-batch-size MICRO_BATCH_SIZE] +[default6]: [--batch-size BATCH_SIZE] +[default6]: [--global-batch-size GLOBAL_BATCH_SIZE] +[default6]: [--rampup-batch-size [RAMPUP_BATCH_SIZE [RAMPUP_BATCH_SIZE ...]]] +[default6]: [--checkpoint-activations] +[default6]: [--distribute-checkpointed-activations] +[default6]: [--checkpoint-num-layers CHECKPOINT_NUM_LAYERS] +[default6]: [--train-iters TRAIN_ITERS] +[default6]: [--train-samples TRAIN_SAMPLES] +[default6]: [--train-tokens TRAIN_TOKENS] +[default6]: [--log-interval LOG_INTERVAL] +[default6]: [--exit-interval EXIT_INTERVAL] +[default6]: [--exit-duration-in-mins EXIT_DURATION_IN_MINS] +[default6]: [--tensorboard-dir TENSORBOARD_DIR] +[default6]: [--no-masked-softmax-fusion] [--no-bias-gelu-fusion] +[default6]: [--no-bias-dropout-fusion] [--optimizer {adam,sgd}] +[default6]: [--use-bnb-optimizer] +[default6]: [--dataloader-type {single,cyclic}] [--cpu-optimizer] +[default6]: [--cpu_torch_adam] [--codecarbon-dir CODECARBON_DIR] +[default6]: [--eval-only EVAL_ONLY] +[default6]: [--skip-train-iteration-range SKIP_TRAIN_ITERATION_RANGE [SKIP_TRAIN_ITERATION_RANGE ...]] +[default6]: [--inference] +[default6]: [--abort-on-unmet-fused-kernel-constraints] +[default6]: [--pp-partition-method PP_PARTITION_METHOD] +[default6]: [--seed SEED] [--init-method-std INIT_METHOD_STD] +[default6]: [--init-method-xavier-uniform] [--lr LR] +[default6]: [--lr-decay-style {constant,linear,cosine}] +[default6]: [--lr-decay-iters LR_DECAY_ITERS] +[default6]: [--lr-decay-samples LR_DECAY_SAMPLES] +[default6]: [--lr-decay-tokens LR_DECAY_TOKENS] +[default6]: [--lr-warmup-fraction LR_WARMUP_FRACTION] +[default6]: [--lr-warmup-iters LR_WARMUP_ITERS] +[default6]: [--lr-warmup-samples LR_WARMUP_SAMPLES] +[default6]: [--warmup WARMUP] [--min-lr MIN_LR] +[default6]: [--override-lr-scheduler] +[default6]: [--use-checkpoint-lr-scheduler] [--save SAVE] +[default6]: [--save-interval SAVE_INTERVAL] [--no-save-optim] +[default6]: [--no-save-rng] [--load LOAD] [--no-load-optim] +[default6]: [--no-load-rng] [--finetune] [--fp16] [--bf16] +[default6]: [--loss-scale LOSS_SCALE] +[default6]: [--initial-loss-scale INITIAL_LOSS_SCALE] +[default6]: [--min-loss-scale MIN_LOSS_SCALE] +[default6]: [--loss-scale-window LOSS_SCALE_WINDOW] +[default6]: [--hysteresis HYSTERESIS] [--fp32-residual-connection] +[default6]: [--no-query-key-layer-scaling] +[default6]: [--attention-softmax-in-fp32] +[default6]: [--accumulate-allreduce-grads-in-fp32] +[default6]: [--fp16-lm-cross-entropy] +[default6]: [--tensor-model-parallel-size TENSOR_MODEL_PARALLEL_SIZE] +[default6]: [--pipeline-model-parallel-size PIPELINE_MODEL_PARALLEL_SIZE] +[default6]: [--model-parallel-size MODEL_PARALLEL_SIZE] +[default6]: [--num-layers-per-virtual-pipeline-stage NUM_LAYERS_PER_VIRTUAL_PIPELINE_STAGE] +[default6]: [--distributed-backend {nccl,gloo}] +[default6]: [--DDP-impl {local,torch}] +[default6]: [--use-contiguous-buffers-in-ddp] +[default6]: [--no-scatter-gather-tensors-in-pipeline] +[default6]: [--local_rank LOCAL_RANK] +[default6]: [--lazy-mpu-init LAZY_MPU_INIT] +[default6]: [--use-cpu-initialization] [--eval-iters EVAL_ITERS] +[default6]: [--eval-interval EVAL_INTERVAL] +[default6]: [--data-path [DATA_PATH [DATA_PATH ...]]] +[default6]: [--split SPLIT] +[default6]: [--train-weighted-split-paths [TRAIN_WEIGHTED_SPLIT_PATHS [TRAIN_WEIGHTED_SPLIT_PATHS ...]]] +[default6]: [--valid-weighted-split-paths [VALID_WEIGHTED_SPLIT_PATHS [VALID_WEIGHTED_SPLIT_PATHS ...]]] +[default6]: [--test-weighted-split-paths [TEST_WEIGHTED_SPLIT_PATHS [TEST_WEIGHTED_SPLIT_PATHS ...]]] +[default6]: [--train-weighted-split-paths-path TRAIN_WEIGHTED_SPLIT_PATHS_PATH] +[default6]: [--valid-weighted-split-paths-path VALID_WEIGHTED_SPLIT_PATHS_PATH] +[default6]: [--test-weighted-split-paths-path TEST_WEIGHTED_SPLIT_PATHS_PATH] +[default6]: [--log-path LOG_PATH] [--vocab-file VOCAB_FILE] +[default6]: [--merge-file MERGE_FILE] +[default6]: [--vocab-extra-ids VOCAB_EXTRA_IDS] +[default6]: [--seq-length SEQ_LENGTH] +[default6]: [--encoder-seq-length ENCODER_SEQ_LENGTH] +[default6]: [--decoder-seq-length DECODER_SEQ_LENGTH] +[default6]: [--retriever-seq-length RETRIEVER_SEQ_LENGTH] +[default6]: [--sample-rate SAMPLE_RATE] [--mask-prob MASK_PROB] +[default6]: [--short-seq-prob SHORT_SEQ_PROB] [--mmap-warmup] +[default6]: [--num-workers NUM_WORKERS] +[default6]: [--valid-num-workers VALID_NUM_WORKERS] +[default6]: [--tokenizer-type {BertWordPieceLowerCase,BertWordPieceCase,GPT2BPETokenizer,PretrainedFromHF}] +[default6]: [--tokenizer-name-or-path TOKENIZER_NAME_OR_PATH] +[default6]: [--data-impl {lazy,cached,mmap,infer}] +[default6]: [--reset-position-ids] [--reset-attention-mask] +[default6]: [--eod-mask-loss] [--loss-on-targets-only] +[default6]: [--norm-target-loss] +[default6]: [--reweight-loss-based-on-position-frequency] +[default6]: [--noise-density NOISE_DENSITY] +[default6]: [--mean-noise-span-length MEAN_NOISE_SPAN_LENGTH] +[default6]: [--prefixlm] [--adlr-autoresume] +[default6]: [--adlr-autoresume-interval ADLR_AUTORESUME_INTERVAL] +[default6]: [--ict-head-size ICT_HEAD_SIZE] +[default6]: [--biencoder-projection-dim BIENCODER_PROJECTION_DIM] +[default6]: [--biencoder-shared-query-context-model] +[default6]: [--ict-load ICT_LOAD] [--bert-load BERT_LOAD] +[default6]: [--titles-data-path TITLES_DATA_PATH] +[default6]: [--query-in-block-prob QUERY_IN_BLOCK_PROB] +[default6]: [--use-one-sent-docs] +[default6]: [--evidence-data-path EVIDENCE_DATA_PATH] +[default6]: [--retriever-report-topk-accuracies RETRIEVER_REPORT_TOPK_ACCURACIES [RETRIEVER_REPORT_TOPK_ACCURACIES ...]] +[default6]: [--retriever-score-scaling] +[default6]: [--block-data-path BLOCK_DATA_PATH] +[default6]: [--embedding-path EMBEDDING_PATH] +[default6]: [--indexer-batch-size INDEXER_BATCH_SIZE] +[default6]: [--indexer-log-interval INDEXER_LOG_INTERVAL] +[default6]: [--num-classes NUM_CLASSES] [--img-dim IMG_DIM] +[default6]: [--num-channels NUM_CHANNELS] [--patch-dim PATCH_DIM] +[default6]: [--log-params-norm] [--log-num-zeros-in-grad] +[default6]: [--tensorboard-log-interval TENSORBOARD_LOG_INTERVAL] +[default6]: [--tensorboard-queue-size TENSORBOARD_QUEUE_SIZE] +[default6]: [--log-timers-to-tensorboard] +[default6]: [--log-batch-size-to-tensorboard] +[default6]: [--no-log-learnig-rate-to-tensorboard] +[default6]: [--no-log-loss-scale-to-tensorboard] +[default6]: [--log-validation-ppl-to-tensorboard] +[default6]: [--zero-stage ZERO_STAGE] [--zero-reduce-scatter] +[default6]: [--zero-contigious-gradients] +[default6]: [--zero-reduce-bucket-size ZERO_REDUCE_BUCKET_SIZE] +[default6]: [--zero-allgather-bucket-size ZERO_ALLGATHER_BUCKET_SIZE] +[default6]: [--remote-device {none,cpu,nvme}] [--use-pin-memory] +[default6]: [--scattered-embeddings] [--split-transformers] +[default6]: [--memory-centric-tiled-linear] +[default6]: [--tile-factor TILE_FACTOR] +[default6]: [--deepspeed-activation-checkpointing] +[default6]: [--partition-activations] [--contigious-checkpointing] +[default6]: [--checkpoint-in-cpu] [--synchronize-each-layer] +[default6]: [--profile-backward] [--deepspeed] +[default6]: [--deepspeed_config DEEPSPEED_CONFIG] [--deepscale] +[default6]: [--deepscale_config DEEPSCALE_CONFIG] [--deepspeed_mpi] +[default6]:finetune_t0.py: error: unrecognized arguments: --reset-progress +[default7]:usage: finetune_t0.py [-h] [--num-layers NUM_LAYERS] +[default7]: [--hidden-size HIDDEN_SIZE] +[default7]: [--ffn-hidden-size FFN_HIDDEN_SIZE] +[default7]: [--num-attention-heads NUM_ATTENTION_HEADS] +[default7]: [--kv-channels KV_CHANNELS] +[default7]: [--max-position-embeddings MAX_POSITION_EMBEDDINGS] +[default7]: [--make-vocab-size-divisible-by MAKE_VOCAB_SIZE_DIVISIBLE_BY] +[default7]: [--pad-vocab-size-to PAD_VOCAB_SIZE_TO] +[default7]: [--layernorm-epsilon LAYERNORM_EPSILON] +[default7]: [--sync-tp-duplicated-parameters] +[default7]: [--apply-residual-connection-post-layernorm] +[default7]: [--embed-layernorm] [--openai-gelu] +[default7]: [--onnx-safe ONNX_SAFE] [--bert-no-binary-head] +[default7]: [--position-embedding-type {PositionEmbeddingType.rotary,PositionEmbeddingType.absolute,PositionEmbeddingType.alibi}] +[default7]: [--glu-activation {geglu,liglu,reglu,swiglu}] +[default7]: [--kill-switch-path KILL_SWITCH_PATH] +[default7]: [--log-level {debug,info,warning,error,critical}] +[default7]: [--log-level-replica {debug,info,warning,error,critical}] +[default7]: [--attention-dropout ATTENTION_DROPOUT] +[default7]: [--hidden-dropout HIDDEN_DROPOUT] +[default7]: [--weight-decay WEIGHT_DECAY] [--clip-grad CLIP_GRAD] +[default7]: [--adam-beta1 ADAM_BETA1] [--adam-beta2 ADAM_BETA2] +[default7]: [--adam-eps ADAM_EPS] [--sgd-momentum SGD_MOMENTUM] +[default7]: [--micro-batch-size MICRO_BATCH_SIZE] +[default7]: [--batch-size BATCH_SIZE] +[default7]: [--global-batch-size GLOBAL_BATCH_SIZE] +[default7]: [--rampup-batch-size [RAMPUP_BATCH_SIZE [RAMPUP_BATCH_SIZE ...]]] +[default7]: [--checkpoint-activations] +[default7]: [--distribute-checkpointed-activations] +[default7]: [--checkpoint-num-layers CHECKPOINT_NUM_LAYERS] +[default7]: [--train-iters TRAIN_ITERS] +[default7]: [--train-samples TRAIN_SAMPLES] +[default7]: [--train-tokens TRAIN_TOKENS] +[default7]: [--log-interval LOG_INTERVAL] +[default7]: [--exit-interval EXIT_INTERVAL] +[default7]: [--exit-duration-in-mins EXIT_DURATION_IN_MINS] +[default7]: [--tensorboard-dir TENSORBOARD_DIR] +[default7]: [--no-masked-softmax-fusion] [--no-bias-gelu-fusion] +[default7]: [--no-bias-dropout-fusion] [--optimizer {adam,sgd}] +[default7]: [--use-bnb-optimizer] +[default7]: [--dataloader-type {single,cyclic}] [--cpu-optimizer] +[default7]: [--cpu_torch_adam] [--codecarbon-dir CODECARBON_DIR] +[default7]: [--eval-only EVAL_ONLY] +[default7]: [--skip-train-iteration-range SKIP_TRAIN_ITERATION_RANGE [SKIP_TRAIN_ITERATION_RANGE ...]] +[default7]: [--inference] +[default7]: [--abort-on-unmet-fused-kernel-constraints] +[default7]: [--pp-partition-method PP_PARTITION_METHOD] +[default7]: [--seed SEED] [--init-method-std INIT_METHOD_STD] +[default7]: [--init-method-xavier-uniform] [--lr LR] +[default7]: [--lr-decay-style {constant,linear,cosine}] +[default7]: [--lr-decay-iters LR_DECAY_ITERS] +[default7]: [--lr-decay-samples LR_DECAY_SAMPLES] +[default7]: [--lr-decay-tokens LR_DECAY_TOKENS] +[default7]: [--lr-warmup-fraction LR_WARMUP_FRACTION] +[default7]: [--lr-warmup-iters LR_WARMUP_ITERS] +[default7]: [--lr-warmup-samples LR_WARMUP_SAMPLES] +[default7]: [--warmup WARMUP] [--min-lr MIN_LR] +[default7]: [--override-lr-scheduler] +[default7]: [--use-checkpoint-lr-scheduler] [--save SAVE] +[default7]: [--save-interval SAVE_INTERVAL] [--no-save-optim] +[default7]: [--no-save-rng] [--load LOAD] [--no-load-optim] +[default7]: [--no-load-rng] [--finetune] [--fp16] [--bf16] +[default7]: [--loss-scale LOSS_SCALE] +[default7]: [--initial-loss-scale INITIAL_LOSS_SCALE] +[default7]: [--min-loss-scale MIN_LOSS_SCALE] +[default7]: [--loss-scale-window LOSS_SCALE_WINDOW] +[default7]: [--hysteresis HYSTERESIS] [--fp32-residual-connection] +[default7]: [--no-query-key-layer-scaling] +[default7]: [--attention-softmax-in-fp32] +[default7]: [--accumulate-allreduce-grads-in-fp32] +[default7]: [--fp16-lm-cross-entropy] +[default7]: [--tensor-model-parallel-size TENSOR_MODEL_PARALLEL_SIZE] +[default7]: [--pipeline-model-parallel-size PIPELINE_MODEL_PARALLEL_SIZE] +[default7]: [--model-parallel-size MODEL_PARALLEL_SIZE] +[default7]: [--num-layers-per-virtual-pipeline-stage NUM_LAYERS_PER_VIRTUAL_PIPELINE_STAGE] +[default7]: [--distributed-backend {nccl,gloo}] +[default7]: [--DDP-impl {local,torch}] +[default7]: [--use-contiguous-buffers-in-ddp] +[default7]: [--no-scatter-gather-tensors-in-pipeline] +[default7]: [--local_rank LOCAL_RANK] +[default7]: [--lazy-mpu-init LAZY_MPU_INIT] +[default7]: [--use-cpu-initialization] [--eval-iters EVAL_ITERS] +[default7]: [--eval-interval EVAL_INTERVAL] +[default7]: [--data-path [DATA_PATH [DATA_PATH ...]]] +[default7]: [--split SPLIT] +[default7]: [--train-weighted-split-paths [TRAIN_WEIGHTED_SPLIT_PATHS [TRAIN_WEIGHTED_SPLIT_PATHS ...]]] +[default7]: [--valid-weighted-split-paths [VALID_WEIGHTED_SPLIT_PATHS [VALID_WEIGHTED_SPLIT_PATHS ...]]] +[default7]: [--test-weighted-split-paths [TEST_WEIGHTED_SPLIT_PATHS [TEST_WEIGHTED_SPLIT_PATHS ...]]] +[default7]: [--train-weighted-split-paths-path TRAIN_WEIGHTED_SPLIT_PATHS_PATH] +[default7]: [--valid-weighted-split-paths-path VALID_WEIGHTED_SPLIT_PATHS_PATH] +[default7]: [--test-weighted-split-paths-path TEST_WEIGHTED_SPLIT_PATHS_PATH] +[default7]: [--log-path LOG_PATH] [--vocab-file VOCAB_FILE] +[default7]: [--merge-file MERGE_FILE] +[default7]: [--vocab-extra-ids VOCAB_EXTRA_IDS] +[default7]: [--seq-length SEQ_LENGTH] +[default7]: [--encoder-seq-length ENCODER_SEQ_LENGTH] +[default7]: [--decoder-seq-length DECODER_SEQ_LENGTH] +[default7]: [--retriever-seq-length RETRIEVER_SEQ_LENGTH] +[default7]: [--sample-rate SAMPLE_RATE] [--mask-prob MASK_PROB] +[default7]: [--short-seq-prob SHORT_SEQ_PROB] [--mmap-warmup] +[default7]: [--num-workers NUM_WORKERS] +[default7]: [--valid-num-workers VALID_NUM_WORKERS] +[default7]: [--tokenizer-type {BertWordPieceLowerCase,BertWordPieceCase,GPT2BPETokenizer,PretrainedFromHF}] +[default7]: [--tokenizer-name-or-path TOKENIZER_NAME_OR_PATH] +[default7]: [--data-impl {lazy,cached,mmap,infer}] +[default7]: [--reset-position-ids] [--reset-attention-mask] +[default7]: [--eod-mask-loss] [--loss-on-targets-only] +[default7]: [--norm-target-loss] +[default7]: [--reweight-loss-based-on-position-frequency] +[default7]: [--noise-density NOISE_DENSITY] +[default7]: [--mean-noise-span-length MEAN_NOISE_SPAN_LENGTH] +[default7]: [--prefixlm] [--adlr-autoresume] +[default7]: [--adlr-autoresume-interval ADLR_AUTORESUME_INTERVAL] +[default7]: [--ict-head-size ICT_HEAD_SIZE] +[default7]: [--biencoder-projection-dim BIENCODER_PROJECTION_DIM] +[default7]: [--biencoder-shared-query-context-model] +[default7]: [--ict-load ICT_LOAD] [--bert-load BERT_LOAD] +[default7]: [--titles-data-path TITLES_DATA_PATH] +[default7]: [--query-in-block-prob QUERY_IN_BLOCK_PROB] +[default7]: [--use-one-sent-docs] +[default7]: [--evidence-data-path EVIDENCE_DATA_PATH] +[default7]: [--retriever-report-topk-accuracies RETRIEVER_REPORT_TOPK_ACCURACIES [RETRIEVER_REPORT_TOPK_ACCURACIES ...]] +[default7]: [--retriever-score-scaling] +[default7]: [--block-data-path BLOCK_DATA_PATH] +[default7]: [--embedding-path EMBEDDING_PATH] +[default7]: [--indexer-batch-size INDEXER_BATCH_SIZE] +[default7]: [--indexer-log-interval INDEXER_LOG_INTERVAL] +[default7]: [--num-classes NUM_CLASSES] [--img-dim IMG_DIM] +[default7]: [--num-channels NUM_CHANNELS] [--patch-dim PATCH_DIM] +[default7]: [--log-params-norm] [--log-num-zeros-in-grad] +[default7]: [--tensorboard-log-interval TENSORBOARD_LOG_INTERVAL] +[default7]: [--tensorboard-queue-size TENSORBOARD_QUEUE_SIZE] +[default7]: [--log-timers-to-tensorboard] +[default7]: [--log-batch-size-to-tensorboard] +[default7]: [--no-log-learnig-rate-to-tensorboard] +[default7]: [--no-log-loss-scale-to-tensorboard] +[default7]: [--log-validation-ppl-to-tensorboard] +[default7]: [--zero-stage ZERO_STAGE] [--zero-reduce-scatter] +[default7]: [--zero-contigious-gradients] +[default7]: [--zero-reduce-bucket-size ZERO_REDUCE_BUCKET_SIZE] +[default7]: [--zero-allgather-bucket-size ZERO_ALLGATHER_BUCKET_SIZE] +[default7]: [--remote-device {none,cpu,nvme}] [--use-pin-memory] +[default7]: [--scattered-embeddings] [--split-transformers] +[default7]: [--memory-centric-tiled-linear] +[default7]: [--tile-factor TILE_FACTOR] +[default7]: [--deepspeed-activation-checkpointing] +[default7]: [--partition-activations] [--contigious-checkpointing] +[default7]: [--checkpoint-in-cpu] [--synchronize-each-layer] +[default7]: [--profile-backward] [--deepspeed] +[default7]: [--deepspeed_config DEEPSPEED_CONFIG] [--deepscale] +[default7]: [--deepscale_config DEEPSCALE_CONFIG] [--deepspeed_mpi] +[default7]:finetune_t0.py: error: unrecognized arguments: --reset-progress +[default5]:usage: finetune_t0.py [-h] [--num-layers NUM_LAYERS] +[default5]: [--hidden-size HIDDEN_SIZE] +[default5]: [--ffn-hidden-size FFN_HIDDEN_SIZE] +[default5]: [--num-attention-heads NUM_ATTENTION_HEADS] +[default5]: [--kv-channels KV_CHANNELS] +[default5]: [--max-position-embeddings MAX_POSITION_EMBEDDINGS] +[default5]: [--make-vocab-size-divisible-by MAKE_VOCAB_SIZE_DIVISIBLE_BY] +[default5]: [--pad-vocab-size-to PAD_VOCAB_SIZE_TO] +[default5]: [--layernorm-epsilon LAYERNORM_EPSILON] +[default5]: [--sync-tp-duplicated-parameters] +[default5]: [--apply-residual-connection-post-layernorm] +[default5]: [--embed-layernorm] [--openai-gelu] +[default5]: [--onnx-safe ONNX_SAFE] [--bert-no-binary-head] +[default5]: [--position-embedding-type {PositionEmbeddingType.rotary,PositionEmbeddingType.absolute,PositionEmbeddingType.alibi}] +[default5]: [--glu-activation {geglu,liglu,reglu,swiglu}] +[default5]: [--kill-switch-path KILL_SWITCH_PATH] +[default5]: [--log-level {debug,info,warning,error,critical}] +[default5]: [--log-level-replica {debug,info,warning,error,critical}] +[default5]: [--attention-dropout ATTENTION_DROPOUT] +[default5]: [--hidden-dropout HIDDEN_DROPOUT] +[default5]: [--weight-decay WEIGHT_DECAY] [--clip-grad CLIP_GRAD] +[default5]: [--adam-beta1 ADAM_BETA1] [--adam-beta2 ADAM_BETA2] +[default5]: [--adam-eps ADAM_EPS] [--sgd-momentum SGD_MOMENTUM] +[default5]: [--micro-batch-size MICRO_BATCH_SIZE] +[default5]: [--batch-size BATCH_SIZE] +[default5]: [--global-batch-size GLOBAL_BATCH_SIZE] +[default5]: [--rampup-batch-size [RAMPUP_BATCH_SIZE [RAMPUP_BATCH_SIZE ...]]] +[default5]: [--checkpoint-activations] +[default5]: [--distribute-checkpointed-activations] +[default5]: [--checkpoint-num-layers CHECKPOINT_NUM_LAYERS] +[default5]: [--train-iters TRAIN_ITERS] +[default5]: [--train-samples TRAIN_SAMPLES] +[default5]: [--train-tokens TRAIN_TOKENS] +[default5]: [--log-interval LOG_INTERVAL] +[default5]: [--exit-interval EXIT_INTERVAL] +[default5]: [--exit-duration-in-mins EXIT_DURATION_IN_MINS] +[default5]: [--tensorboard-dir TENSORBOARD_DIR] +[default5]: [--no-masked-softmax-fusion] [--no-bias-gelu-fusion] +[default5]: [--no-bias-dropout-fusion] [--optimizer {adam,sgd}] +[default5]: [--use-bnb-optimizer] +[default5]: [--dataloader-type {single,cyclic}] [--cpu-optimizer] +[default5]: [--cpu_torch_adam] [--codecarbon-dir CODECARBON_DIR] +[default5]: [--eval-only EVAL_ONLY] +[default5]: [--skip-train-iteration-range SKIP_TRAIN_ITERATION_RANGE [SKIP_TRAIN_ITERATION_RANGE ...]] +[default5]: [--inference] +[default5]: [--abort-on-unmet-fused-kernel-constraints] +[default5]: [--pp-partition-method PP_PARTITION_METHOD] +[default5]: [--seed SEED] [--init-method-std INIT_METHOD_STD] +[default5]: [--init-method-xavier-uniform] [--lr LR] +[default5]: [--lr-decay-style {constant,linear,cosine}] +[default5]: [--lr-decay-iters LR_DECAY_ITERS] +[default5]: [--lr-decay-samples LR_DECAY_SAMPLES] +[default5]: [--lr-decay-tokens LR_DECAY_TOKENS] +[default5]: [--lr-warmup-fraction LR_WARMUP_FRACTION] +[default5]: [--lr-warmup-iters LR_WARMUP_ITERS] +[default5]: [--lr-warmup-samples LR_WARMUP_SAMPLES] +[default5]: [--warmup WARMUP] [--min-lr MIN_LR] +[default5]: [--override-lr-scheduler] +[default5]: [--use-checkpoint-lr-scheduler] [--save SAVE] +[default5]: [--save-interval SAVE_INTERVAL] [--no-save-optim] +[default5]: [--no-save-rng] [--load LOAD] [--no-load-optim] +[default5]: [--no-load-rng] [--finetune] [--fp16] [--bf16] +[default5]: [--loss-scale LOSS_SCALE] +[default5]: [--initial-loss-scale INITIAL_LOSS_SCALE] +[default5]: [--min-loss-scale MIN_LOSS_SCALE] +[default5]: [--loss-scale-window LOSS_SCALE_WINDOW] +[default5]: [--hysteresis HYSTERESIS] [--fp32-residual-connection] +[default5]: [--no-query-key-layer-scaling] +[default5]: [--attention-softmax-in-fp32] +[default5]: [--accumulate-allreduce-grads-in-fp32] +[default5]: [--fp16-lm-cross-entropy] +[default5]: [--tensor-model-parallel-size TENSOR_MODEL_PARALLEL_SIZE] +[default5]: [--pipeline-model-parallel-size PIPELINE_MODEL_PARALLEL_SIZE] +[default5]: [--model-parallel-size MODEL_PARALLEL_SIZE] +[default5]: [--num-layers-per-virtual-pipeline-stage NUM_LAYERS_PER_VIRTUAL_PIPELINE_STAGE] +[default5]: [--distributed-backend {nccl,gloo}] +[default5]: [--DDP-impl {local,torch}] +[default5]: [--use-contiguous-buffers-in-ddp] +[default5]: [--no-scatter-gather-tensors-in-pipeline] +[default5]: [--local_rank LOCAL_RANK] +[default5]: [--lazy-mpu-init LAZY_MPU_INIT] +[default5]: [--use-cpu-initialization] [--eval-iters EVAL_ITERS] +[default5]: [--eval-interval EVAL_INTERVAL] +[default5]: [--data-path [DATA_PATH [DATA_PATH ...]]] +[default5]: [--split SPLIT] +[default5]: [--train-weighted-split-paths [TRAIN_WEIGHTED_SPLIT_PATHS [TRAIN_WEIGHTED_SPLIT_PATHS ...]]] +[default5]: [--valid-weighted-split-paths [VALID_WEIGHTED_SPLIT_PATHS [VALID_WEIGHTED_SPLIT_PATHS ...]]] +[default5]: [--test-weighted-split-paths [TEST_WEIGHTED_SPLIT_PATHS [TEST_WEIGHTED_SPLIT_PATHS ...]]] +[default5]: [--train-weighted-split-paths-path TRAIN_WEIGHTED_SPLIT_PATHS_PATH] +[default5]: [--valid-weighted-split-paths-path VALID_WEIGHTED_SPLIT_PATHS_PATH] +[default5]: [--test-weighted-split-paths-path TEST_WEIGHTED_SPLIT_PATHS_PATH] +[default5]: [--log-path LOG_PATH] [--vocab-file VOCAB_FILE] +[default5]: [--merge-file MERGE_FILE] +[default5]: [--vocab-extra-ids VOCAB_EXTRA_IDS] +[default5]: [--seq-length SEQ_LENGTH] +[default5]: [--encoder-seq-length ENCODER_SEQ_LENGTH] +[default5]: [--decoder-seq-length DECODER_SEQ_LENGTH] +[default5]: [--retriever-seq-length RETRIEVER_SEQ_LENGTH] +[default5]: [--sample-rate SAMPLE_RATE] [--mask-prob MASK_PROB] +[default5]: [--short-seq-prob SHORT_SEQ_PROB] [--mmap-warmup] +[default5]: [--num-workers NUM_WORKERS] +[default5]: [--valid-num-workers VALID_NUM_WORKERS] +[default5]: [--tokenizer-type {BertWordPieceLowerCase,BertWordPieceCase,GPT2BPETokenizer,PretrainedFromHF}] +[default5]: [--tokenizer-name-or-path TOKENIZER_NAME_OR_PATH] +[default5]: [--data-impl {lazy,cached,mmap,infer}] +[default5]: [--reset-position-ids] [--reset-attention-mask] +[default5]: [--eod-mask-loss] [--loss-on-targets-only] +[default5]: [--norm-target-loss] +[default5]: [--reweight-loss-based-on-position-frequency] +[default5]: [--noise-density NOISE_DENSITY] +[default5]: [--mean-noise-span-length MEAN_NOISE_SPAN_LENGTH] +[default5]: [--prefixlm] [--adlr-autoresume] +[default5]: [--adlr-autoresume-interval ADLR_AUTORESUME_INTERVAL] +[default5]: [--ict-head-size ICT_HEAD_SIZE] +[default5]: [--biencoder-projection-dim BIENCODER_PROJECTION_DIM] +[default5]: [--biencoder-shared-query-context-model] +[default5]: [--ict-load ICT_LOAD] [--bert-load BERT_LOAD] +[default5]: [--titles-data-path TITLES_DATA_PATH] +[default5]: [--query-in-block-prob QUERY_IN_BLOCK_PROB] +[default5]: [--use-one-sent-docs] +[default5]: [--evidence-data-path EVIDENCE_DATA_PATH] +[default5]: [--retriever-report-topk-accuracies RETRIEVER_REPORT_TOPK_ACCURACIES [RETRIEVER_REPORT_TOPK_ACCURACIES ...]] +[default5]: [--retriever-score-scaling] +[default5]: [--block-data-path BLOCK_DATA_PATH] +[default5]: [--embedding-path EMBEDDING_PATH] +[default5]: [--indexer-batch-size INDEXER_BATCH_SIZE] +[default5]: [--indexer-log-interval INDEXER_LOG_INTERVAL] +[default5]: [--num-classes NUM_CLASSES] [--img-dim IMG_DIM] +[default5]: [--num-channels NUM_CHANNELS] [--patch-dim PATCH_DIM] +[default5]: [--log-params-norm] [--log-num-zeros-in-grad] +[default5]: [--tensorboard-log-interval TENSORBOARD_LOG_INTERVAL] +[default5]: [--tensorboard-queue-size TENSORBOARD_QUEUE_SIZE] +[default5]: [--log-timers-to-tensorboard] +[default5]: [--log-batch-size-to-tensorboard] +[default5]: [--no-log-learnig-rate-to-tensorboard] +[default5]: [--no-log-loss-scale-to-tensorboard] +[default5]: [--log-validation-ppl-to-tensorboard] +[default5]: [--zero-stage ZERO_STAGE] [--zero-reduce-scatter] +[default5]: [--zero-contigious-gradients] +[default5]: [--zero-reduce-bucket-size ZERO_REDUCE_BUCKET_SIZE] +[default5]: [--zero-allgather-bucket-size ZERO_ALLGATHER_BUCKET_SIZE] +[default5]: [--remote-device {none,cpu,nvme}] [--use-pin-memory] +[default5]: [--scattered-embeddings] [--split-transformers] +[default5]: [--memory-centric-tiled-linear] +[default5]: [--tile-factor TILE_FACTOR] +[default5]: [--deepspeed-activation-checkpointing] +[default5]: [--partition-activations] [--contigious-checkpointing] +[default5]: [--checkpoint-in-cpu] [--synchronize-each-layer] +[default5]: [--profile-backward] [--deepspeed] +[default5]: [--deepspeed_config DEEPSPEED_CONFIG] [--deepscale] +[default5]: [--deepscale_config DEEPSCALE_CONFIG] [--deepspeed_mpi] +[default5]:finetune_t0.py: error: unrecognized arguments: --reset-progress +[default0]:usage: finetune_t0.py [-h] [--num-layers NUM_LAYERS] +[default0]: [--hidden-size HIDDEN_SIZE] +[default0]: [--ffn-hidden-size FFN_HIDDEN_SIZE] +[default0]: [--num-attention-heads NUM_ATTENTION_HEADS] +[default0]: [--kv-channels KV_CHANNELS] +[default0]: [--max-position-embeddings MAX_POSITION_EMBEDDINGS] +[default0]: [--make-vocab-size-divisible-by MAKE_VOCAB_SIZE_DIVISIBLE_BY] +[default0]: [--pad-vocab-size-to PAD_VOCAB_SIZE_TO] +[default0]: [--layernorm-epsilon LAYERNORM_EPSILON] +[default0]: [--sync-tp-duplicated-parameters] +[default0]: [--apply-residual-connection-post-layernorm] +[default0]: [--embed-layernorm] [--openai-gelu] +[default0]: [--onnx-safe ONNX_SAFE] [--bert-no-binary-head] +[default0]: [--position-embedding-type {PositionEmbeddingType.rotary,PositionEmbeddingType.absolute,PositionEmbeddingType.alibi}] +[default0]: [--glu-activation {geglu,liglu,reglu,swiglu}] +[default0]: [--kill-switch-path KILL_SWITCH_PATH] +[default0]: [--log-level {debug,info,warning,error,critical}] +[default0]: [--log-level-replica {debug,info,warning,error,critical}] +[default0]: [--attention-dropout ATTENTION_DROPOUT] +[default0]: [--hidden-dropout HIDDEN_DROPOUT] +[default0]: [--weight-decay WEIGHT_DECAY] [--clip-grad CLIP_GRAD] +[default0]: [--adam-beta1 ADAM_BETA1] [--adam-beta2 ADAM_BETA2] +[default0]: [--adam-eps ADAM_EPS] [--sgd-momentum SGD_MOMENTUM] +[default0]: [--micro-batch-size MICRO_BATCH_SIZE] +[default0]: [--batch-size BATCH_SIZE] +[default0]: [--global-batch-size GLOBAL_BATCH_SIZE] +[default0]: [--rampup-batch-size [RAMPUP_BATCH_SIZE [RAMPUP_BATCH_SIZE ...]]] +[default0]: [--checkpoint-activations] +[default0]: [--distribute-checkpointed-activations] +[default0]: [--checkpoint-num-layers CHECKPOINT_NUM_LAYERS] +[default0]: [--train-iters TRAIN_ITERS] +[default0]: [--train-samples TRAIN_SAMPLES] +[default0]: [--train-tokens TRAIN_TOKENS] +[default0]: [--log-interval LOG_INTERVAL] +[default0]: [--exit-interval EXIT_INTERVAL] +[default0]: [--exit-duration-in-mins EXIT_DURATION_IN_MINS] +[default0]: [--tensorboard-dir TENSORBOARD_DIR] +[default0]: [--no-masked-softmax-fusion] [--no-bias-gelu-fusion] +[default0]: [--no-bias-dropout-fusion] [--optimizer {adam,sgd}] +[default0]: [--use-bnb-optimizer] +[default0]: [--dataloader-type {single,cyclic}] [--cpu-optimizer] +[default0]: [--cpu_torch_adam] [--codecarbon-dir CODECARBON_DIR] +[default0]: [--eval-only EVAL_ONLY] +[default0]: [--skip-train-iteration-range SKIP_TRAIN_ITERATION_RANGE [SKIP_TRAIN_ITERATION_RANGE ...]] +[default0]: [--inference] +[default0]: [--abort-on-unmet-fused-kernel-constraints] +[default0]: [--pp-partition-method PP_PARTITION_METHOD] +[default0]: [--seed SEED] [--init-method-std INIT_METHOD_STD] +[default0]: [--init-method-xavier-uniform] [--lr LR] +[default0]: [--lr-decay-style {constant,linear,cosine}] +[default0]: [--lr-decay-iters LR_DECAY_ITERS] +[default0]: [--lr-decay-samples LR_DECAY_SAMPLES] +[default0]: [--lr-decay-tokens LR_DECAY_TOKENS] +[default0]: [--lr-warmup-fraction LR_WARMUP_FRACTION] +[default0]: [--lr-warmup-iters LR_WARMUP_ITERS] +[default0]: [--lr-warmup-samples LR_WARMUP_SAMPLES] +[default0]: [--warmup WARMUP] [--min-lr MIN_LR] +[default0]: [--override-lr-scheduler] +[default0]: [--use-checkpoint-lr-scheduler] [--save SAVE] +[default0]: [--save-interval SAVE_INTERVAL] [--no-save-optim] +[default0]: [--no-save-rng] [--load LOAD] [--no-load-optim] +[default0]: [--no-load-rng] [--finetune] [--fp16] [--bf16] +[default0]: [--loss-scale LOSS_SCALE] +[default0]: [--initial-loss-scale INITIAL_LOSS_SCALE] +[default0]: [--min-loss-scale MIN_LOSS_SCALE] +[default0]: [--loss-scale-window LOSS_SCALE_WINDOW] +[default0]: [--hysteresis HYSTERESIS] [--fp32-residual-connection] +[default0]: [--no-query-key-layer-scaling] +[default0]: [--attention-softmax-in-fp32] +[default0]: [--accumulate-allreduce-grads-in-fp32] +[default0]: [--fp16-lm-cross-entropy] +[default0]: [--tensor-model-parallel-size TENSOR_MODEL_PARALLEL_SIZE] +[default0]: [--pipeline-model-parallel-size PIPELINE_MODEL_PARALLEL_SIZE] +[default0]: [--model-parallel-size MODEL_PARALLEL_SIZE] +[default0]: [--num-layers-per-virtual-pipeline-stage NUM_LAYERS_PER_VIRTUAL_PIPELINE_STAGE] +[default0]: [--distributed-backend {nccl,gloo}] +[default0]: [--DDP-impl {local,torch}] +[default0]: [--use-contiguous-buffers-in-ddp] +[default0]: [--no-scatter-gather-tensors-in-pipeline] +[default0]: [--local_rank LOCAL_RANK] +[default0]: [--lazy-mpu-init LAZY_MPU_INIT] +[default0]: [--use-cpu-initialization] [--eval-iters EVAL_ITERS] +[default0]: [--eval-interval EVAL_INTERVAL] +[default0]: [--data-path [DATA_PATH [DATA_PATH ...]]] +[default0]: [--split SPLIT] +[default0]: [--train-weighted-split-paths [TRAIN_WEIGHTED_SPLIT_PATHS [TRAIN_WEIGHTED_SPLIT_PATHS ...]]] +[default0]: [--valid-weighted-split-paths [VALID_WEIGHTED_SPLIT_PATHS [VALID_WEIGHTED_SPLIT_PATHS ...]]] +[default0]: [--test-weighted-split-paths [TEST_WEIGHTED_SPLIT_PATHS [TEST_WEIGHTED_SPLIT_PATHS ...]]] +[default0]: [--train-weighted-split-paths-path TRAIN_WEIGHTED_SPLIT_PATHS_PATH] +[default0]: [--valid-weighted-split-paths-path VALID_WEIGHTED_SPLIT_PATHS_PATH] +[default0]: [--test-weighted-split-paths-path TEST_WEIGHTED_SPLIT_PATHS_PATH] +[default0]: [--log-path LOG_PATH] [--vocab-file VOCAB_FILE] +[default0]: [--merge-file MERGE_FILE] +[default0]: [--vocab-extra-ids VOCAB_EXTRA_IDS] +[default0]: [--seq-length SEQ_LENGTH] +[default0]: [--encoder-seq-length ENCODER_SEQ_LENGTH] +[default0]: [--decoder-seq-length DECODER_SEQ_LENGTH] +[default0]: [--retriever-seq-length RETRIEVER_SEQ_LENGTH] +[default0]: [--sample-rate SAMPLE_RATE] [--mask-prob MASK_PROB] +[default0]: [--short-seq-prob SHORT_SEQ_PROB] [--mmap-warmup] +[default0]: [--num-workers NUM_WORKERS] +[default0]: [--valid-num-workers VALID_NUM_WORKERS] +[default0]: [--tokenizer-type {BertWordPieceLowerCase,BertWordPieceCase,GPT2BPETokenizer,PretrainedFromHF}] +[default0]: [--tokenizer-name-or-path TOKENIZER_NAME_OR_PATH] +[default0]: [--data-impl {lazy,cached,mmap,infer}] +[default0]: [--reset-position-ids] [--reset-attention-mask] +[default0]: [--eod-mask-loss] [--loss-on-targets-only] +[default0]: [--norm-target-loss] +[default0]: [--reweight-loss-based-on-position-frequency] +[default0]: [--noise-density NOISE_DENSITY] +[default0]: [--mean-noise-span-length MEAN_NOISE_SPAN_LENGTH] +[default0]: [--prefixlm] [--adlr-autoresume] +[default0]: [--adlr-autoresume-interval ADLR_AUTORESUME_INTERVAL] +[default0]: [--ict-head-size ICT_HEAD_SIZE] +[default0]: [--biencoder-projection-dim BIENCODER_PROJECTION_DIM] +[default0]: [--biencoder-shared-query-context-model] +[default0]: [--ict-load ICT_LOAD] [--bert-load BERT_LOAD] +[default0]: [--titles-data-path TITLES_DATA_PATH] +[default0]: [--query-in-block-prob QUERY_IN_BLOCK_PROB] +[default0]: [--use-one-sent-docs] +[default0]: [--evidence-data-path EVIDENCE_DATA_PATH] +[default0]: [--retriever-report-topk-accuracies RETRIEVER_REPORT_TOPK_ACCURACIES [RETRIEVER_REPORT_TOPK_ACCURACIES ...]] +[default0]: [--retriever-score-scaling] +[default0]: [--block-data-path BLOCK_DATA_PATH] +[default0]: [--embedding-path EMBEDDING_PATH] +[default0]: [--indexer-batch-size INDEXER_BATCH_SIZE] +[default0]: [--indexer-log-interval INDEXER_LOG_INTERVAL] +[default0]: [--num-classes NUM_CLASSES] [--img-dim IMG_DIM] +[default0]: [--num-channels NUM_CHANNELS] [--patch-dim PATCH_DIM] +[default0]: [--log-params-norm] [--log-num-zeros-in-grad] +[default0]: [--tensorboard-log-interval TENSORBOARD_LOG_INTERVAL] +[default0]: [--tensorboard-queue-size TENSORBOARD_QUEUE_SIZE] +[default0]: [--log-timers-to-tensorboard] +[default0]: [--log-batch-size-to-tensorboard] +[default0]: [--no-log-learnig-rate-to-tensorboard] +[default0]: [--no-log-loss-scale-to-tensorboard] +[default0]: [--log-validation-ppl-to-tensorboard] +[default0]: [--zero-stage ZERO_STAGE] [--zero-reduce-scatter] +[default0]: [--zero-contigious-gradients] +[default0]: [--zero-reduce-bucket-size ZERO_REDUCE_BUCKET_SIZE] +[default0]: [--zero-allgather-bucket-size ZERO_ALLGATHER_BUCKET_SIZE] +[default0]: [--remote-device {none,cpu,nvme}] [--use-pin-memory] +[default0]: [--scattered-embeddings] [--split-transformers] +[default0]: [--memory-centric-tiled-linear] +[default0]: [--tile-factor TILE_FACTOR] +[default0]: [--deepspeed-activation-checkpointing] +[default0]: [--partition-activations] [--contigious-checkpointing] +[default0]: [--checkpoint-in-cpu] [--synchronize-each-layer] +[default0]: [--profile-backward] [--deepspeed] +[default0]: [--deepspeed_config DEEPSPEED_CONFIG] [--deepscale] +[default0]: [--deepscale_config DEEPSCALE_CONFIG] [--deepspeed_mpi] +[default0]:finetune_t0.py: error: unrecognized arguments: --reset-progress +[default3]:usage: finetune_t0.py [-h] [--num-layers NUM_LAYERS] +[default3]: [--hidden-size HIDDEN_SIZE] +[default3]: [--ffn-hidden-size FFN_HIDDEN_SIZE] +[default3]: [--num-attention-heads NUM_ATTENTION_HEADS] +[default3]: [--kv-channels KV_CHANNELS] +[default3]: [--max-position-embeddings MAX_POSITION_EMBEDDINGS] +[default3]: [--make-vocab-size-divisible-by MAKE_VOCAB_SIZE_DIVISIBLE_BY] +[default3]: [--pad-vocab-size-to PAD_VOCAB_SIZE_TO] +[default3]: [--layernorm-epsilon LAYERNORM_EPSILON] +[default3]: [--sync-tp-duplicated-parameters] +[default3]: [--apply-residual-connection-post-layernorm] +[default3]: [--embed-layernorm] [--openai-gelu] +[default3]: [--onnx-safe ONNX_SAFE] [--bert-no-binary-head] +[default3]: [--position-embedding-type {PositionEmbeddingType.rotary,PositionEmbeddingType.absolute,PositionEmbeddingType.alibi}] +[default3]: [--glu-activation {geglu,liglu,reglu,swiglu}] +[default3]: [--kill-switch-path KILL_SWITCH_PATH] +[default3]: [--log-level {debug,info,warning,error,critical}] +[default3]: [--log-level-replica {debug,info,warning,error,critical}] +[default3]: [--attention-dropout ATTENTION_DROPOUT] +[default3]: [--hidden-dropout HIDDEN_DROPOUT] +[default3]: [--weight-decay WEIGHT_DECAY] [--clip-grad CLIP_GRAD] +[default3]: [--adam-beta1 ADAM_BETA1] [--adam-beta2 ADAM_BETA2] +[default3]: [--adam-eps ADAM_EPS] [--sgd-momentum SGD_MOMENTUM] +[default3]: [--micro-batch-size MICRO_BATCH_SIZE] +[default3]: [--batch-size BATCH_SIZE] +[default3]: [--global-batch-size GLOBAL_BATCH_SIZE] +[default3]: [--rampup-batch-size [RAMPUP_BATCH_SIZE [RAMPUP_BATCH_SIZE ...]]] +[default3]: [--checkpoint-activations] +[default3]: [--distribute-checkpointed-activations] +[default3]: [--checkpoint-num-layers CHECKPOINT_NUM_LAYERS] +[default3]: [--train-iters TRAIN_ITERS] +[default3]: [--train-samples TRAIN_SAMPLES] +[default3]: [--train-tokens TRAIN_TOKENS] +[default3]: [--log-interval LOG_INTERVAL] +[default3]: [--exit-interval EXIT_INTERVAL] +[default3]: [--exit-duration-in-mins EXIT_DURATION_IN_MINS] +[default3]: [--tensorboard-dir TENSORBOARD_DIR] +[default3]: [--no-masked-softmax-fusion] [--no-bias-gelu-fusion] +[default3]: [--no-bias-dropout-fusion] [--optimizer {adam,sgd}] +[default3]: [--use-bnb-optimizer] +[default3]: [--dataloader-type {single,cyclic}] [--cpu-optimizer] +[default3]: [--cpu_torch_adam] [--codecarbon-dir CODECARBON_DIR] +[default3]: [--eval-only EVAL_ONLY] +[default3]: [--skip-train-iteration-range SKIP_TRAIN_ITERATION_RANGE [SKIP_TRAIN_ITERATION_RANGE ...]] +[default3]: [--inference] +[default3]: [--abort-on-unmet-fused-kernel-constraints] +[default3]: [--pp-partition-method PP_PARTITION_METHOD] +[default3]: [--seed SEED] [--init-method-std INIT_METHOD_STD] +[default3]: [--init-method-xavier-uniform] [--lr LR] +[default3]: [--lr-decay-style {constant,linear,cosine}] +[default3]: [--lr-decay-iters LR_DECAY_ITERS] +[default3]: [--lr-decay-samples LR_DECAY_SAMPLES] +[default3]: [--lr-decay-tokens LR_DECAY_TOKENS] +[default3]: [--lr-warmup-fraction LR_WARMUP_FRACTION] +[default3]: [--lr-warmup-iters LR_WARMUP_ITERS] +[default3]: [--lr-warmup-samples LR_WARMUP_SAMPLES] +[default3]: [--warmup WARMUP] [--min-lr MIN_LR] +[default3]: [--override-lr-scheduler] +[default3]: [--use-checkpoint-lr-scheduler] [--save SAVE] +[default3]: [--save-interval SAVE_INTERVAL] [--no-save-optim] +[default3]: [--no-save-rng] [--load LOAD] [--no-load-optim] +[default3]: [--no-load-rng] [--finetune] [--fp16] [--bf16] +[default3]: [--loss-scale LOSS_SCALE] +[default3]: [--initial-loss-scale INITIAL_LOSS_SCALE] +[default3]: [--min-loss-scale MIN_LOSS_SCALE] +[default3]: [--loss-scale-window LOSS_SCALE_WINDOW] +[default3]: [--hysteresis HYSTERESIS] [--fp32-residual-connection] +[default3]: [--no-query-key-layer-scaling] +[default3]: [--attention-softmax-in-fp32] +[default3]: [--accumulate-allreduce-grads-in-fp32] +[default3]: [--fp16-lm-cross-entropy] +[default3]: [--tensor-model-parallel-size TENSOR_MODEL_PARALLEL_SIZE] +[default3]: [--pipeline-model-parallel-size PIPELINE_MODEL_PARALLEL_SIZE] +[default3]: [--model-parallel-size MODEL_PARALLEL_SIZE] +[default3]: [--num-layers-per-virtual-pipeline-stage NUM_LAYERS_PER_VIRTUAL_PIPELINE_STAGE] +[default3]: [--distributed-backend {nccl,gloo}] +[default3]: [--DDP-impl {local,torch}] +[default3]: [--use-contiguous-buffers-in-ddp] +[default3]: [--no-scatter-gather-tensors-in-pipeline] +[default3]: [--local_rank LOCAL_RANK] +[default3]: [--lazy-mpu-init LAZY_MPU_INIT] +[default3]: [--use-cpu-initialization] [--eval-iters EVAL_ITERS] +[default3]: [--eval-interval EVAL_INTERVAL] +[default3]: [--data-path [DATA_PATH [DATA_PATH ...]]] +[default3]: [--split SPLIT] +[default3]: [--train-weighted-split-paths [TRAIN_WEIGHTED_SPLIT_PATHS [TRAIN_WEIGHTED_SPLIT_PATHS ...]]] +[default3]: [--valid-weighted-split-paths [VALID_WEIGHTED_SPLIT_PATHS [VALID_WEIGHTED_SPLIT_PATHS ...]]] +[default3]: [--test-weighted-split-paths [TEST_WEIGHTED_SPLIT_PATHS [TEST_WEIGHTED_SPLIT_PATHS ...]]] +[default3]: [--train-weighted-split-paths-path TRAIN_WEIGHTED_SPLIT_PATHS_PATH] +[default3]: [--valid-weighted-split-paths-path VALID_WEIGHTED_SPLIT_PATHS_PATH] +[default3]: [--test-weighted-split-paths-path TEST_WEIGHTED_SPLIT_PATHS_PATH] +[default3]: [--log-path LOG_PATH] [--vocab-file VOCAB_FILE] +[default3]: [--merge-file MERGE_FILE] +[default3]: [--vocab-extra-ids VOCAB_EXTRA_IDS] +[default3]: [--seq-length SEQ_LENGTH] +[default3]: [--encoder-seq-length ENCODER_SEQ_LENGTH] +[default3]: [--decoder-seq-length DECODER_SEQ_LENGTH] +[default3]: [--retriever-seq-length RETRIEVER_SEQ_LENGTH] +[default3]: [--sample-rate SAMPLE_RATE] [--mask-prob MASK_PROB] +[default3]: [--short-seq-prob SHORT_SEQ_PROB] [--mmap-warmup] +[default3]: [--num-workers NUM_WORKERS] +[default3]: [--valid-num-workers VALID_NUM_WORKERS] +[default3]: [--tokenizer-type {BertWordPieceLowerCase,BertWordPieceCase,GPT2BPETokenizer,PretrainedFromHF}] +[default3]: [--tokenizer-name-or-path TOKENIZER_NAME_OR_PATH] +[default3]: [--data-impl {lazy,cached,mmap,infer}] +[default3]: [--reset-position-ids] [--reset-attention-mask] +[default3]: [--eod-mask-loss] [--loss-on-targets-only] +[default3]: [--norm-target-loss] +[default3]: [--reweight-loss-based-on-position-frequency] +[default3]: [--noise-density NOISE_DENSITY] +[default3]: [--mean-noise-span-length MEAN_NOISE_SPAN_LENGTH] +[default3]: [--prefixlm] [--adlr-autoresume] +[default3]: [--adlr-autoresume-interval ADLR_AUTORESUME_INTERVAL] +[default3]: [--ict-head-size ICT_HEAD_SIZE] +[default3]: [--biencoder-projection-dim BIENCODER_PROJECTION_DIM] +[default3]: [--biencoder-shared-query-context-model] +[default3]: [--ict-load ICT_LOAD] [--bert-load BERT_LOAD] +[default3]: [--titles-data-path TITLES_DATA_PATH] +[default3]: [--query-in-block-prob QUERY_IN_BLOCK_PROB] +[default3]: [--use-one-sent-docs] +[default3]: [--evidence-data-path EVIDENCE_DATA_PATH] +[default3]: [--retriever-report-topk-accuracies RETRIEVER_REPORT_TOPK_ACCURACIES [RETRIEVER_REPORT_TOPK_ACCURACIES ...]] +[default3]: [--retriever-score-scaling] +[default3]: [--block-data-path BLOCK_DATA_PATH] +[default3]: [--embedding-path EMBEDDING_PATH] +[default3]: [--indexer-batch-size INDEXER_BATCH_SIZE] +[default3]: [--indexer-log-interval INDEXER_LOG_INTERVAL] +[default3]: [--num-classes NUM_CLASSES] [--img-dim IMG_DIM] +[default3]: [--num-channels NUM_CHANNELS] [--patch-dim PATCH_DIM] +[default3]: [--log-params-norm] [--log-num-zeros-in-grad] +[default3]: [--tensorboard-log-interval TENSORBOARD_LOG_INTERVAL] +[default3]: [--tensorboard-queue-size TENSORBOARD_QUEUE_SIZE] +[default3]: [--log-timers-to-tensorboard] +[default3]: [--log-batch-size-to-tensorboard] +[default3]: [--no-log-learnig-rate-to-tensorboard] +[default3]: [--no-log-loss-scale-to-tensorboard] +[default3]: [--log-validation-ppl-to-tensorboard] +[default3]: [--zero-stage ZERO_STAGE] [--zero-reduce-scatter] +[default3]: [--zero-contigious-gradients] +[default3]: [--zero-reduce-bucket-size ZERO_REDUCE_BUCKET_SIZE] +[default3]: [--zero-allgather-bucket-size ZERO_ALLGATHER_BUCKET_SIZE] +[default3]: [--remote-device {none,cpu,nvme}] [--use-pin-memory] +[default3]: [--scattered-embeddings] [--split-transformers] +[default3]: [--memory-centric-tiled-linear] +[default3]: [--tile-factor TILE_FACTOR] +[default3]: [--deepspeed-activation-checkpointing] +[default3]: [--partition-activations] [--contigious-checkpointing] +[default3]: [--checkpoint-in-cpu] [--synchronize-each-layer] +[default3]: [--profile-backward] [--deepspeed] +[default3]: [--deepspeed_config DEEPSPEED_CONFIG] [--deepscale] +[default3]: [--deepscale_config DEEPSCALE_CONFIG] [--deepspeed_mpi] +[default3]:finetune_t0.py: error: unrecognized arguments: --reset-progress +[default2]:usage: finetune_t0.py [-h] [--num-layers NUM_LAYERS] +[default2]: [--hidden-size HIDDEN_SIZE] +[default2]: [--ffn-hidden-size FFN_HIDDEN_SIZE] +[default2]: [--num-attention-heads NUM_ATTENTION_HEADS] +[default2]: [--kv-channels KV_CHANNELS] +[default2]: [--max-position-embeddings MAX_POSITION_EMBEDDINGS] +[default2]: [--make-vocab-size-divisible-by MAKE_VOCAB_SIZE_DIVISIBLE_BY] +[default2]: [--pad-vocab-size-to PAD_VOCAB_SIZE_TO] +[default2]: [--layernorm-epsilon LAYERNORM_EPSILON] +[default2]: [--sync-tp-duplicated-parameters] +[default2]: [--apply-residual-connection-post-layernorm] +[default2]: [--embed-layernorm] [--openai-gelu] +[default2]: [--onnx-safe ONNX_SAFE] [--bert-no-binary-head] +[default2]: [--position-embedding-type {PositionEmbeddingType.rotary,PositionEmbeddingType.absolute,PositionEmbeddingType.alibi}] +[default2]: [--glu-activation {geglu,liglu,reglu,swiglu}] +[default2]: [--kill-switch-path KILL_SWITCH_PATH] +[default2]: [--log-level {debug,info,warning,error,critical}] +[default2]: [--log-level-replica {debug,info,warning,error,critical}] +[default2]: [--attention-dropout ATTENTION_DROPOUT] +[default2]: [--hidden-dropout HIDDEN_DROPOUT] +[default2]: [--weight-decay WEIGHT_DECAY] [--clip-grad CLIP_GRAD] +[default2]: [--adam-beta1 ADAM_BETA1] [--adam-beta2 ADAM_BETA2] +[default2]: [--adam-eps ADAM_EPS] [--sgd-momentum SGD_MOMENTUM] +[default2]: [--micro-batch-size MICRO_BATCH_SIZE] +[default2]: [--batch-size BATCH_SIZE] +[default2]: [--global-batch-size GLOBAL_BATCH_SIZE] +[default2]: [--rampup-batch-size [RAMPUP_BATCH_SIZE [RAMPUP_BATCH_SIZE ...]]] +[default2]: [--checkpoint-activations] +[default2]: [--distribute-checkpointed-activations] +[default2]: [--checkpoint-num-layers CHECKPOINT_NUM_LAYERS] +[default2]: [--train-iters TRAIN_ITERS] +[default2]: [--train-samples TRAIN_SAMPLES] +[default2]: [--train-tokens TRAIN_TOKENS] +[default2]: [--log-interval LOG_INTERVAL] +[default2]: [--exit-interval EXIT_INTERVAL] +[default2]: [--exit-duration-in-mins EXIT_DURATION_IN_MINS] +[default2]: [--tensorboard-dir TENSORBOARD_DIR] +[default2]: [--no-masked-softmax-fusion] [--no-bias-gelu-fusion] +[default2]: [--no-bias-dropout-fusion] [--optimizer {adam,sgd}] +[default2]: [--use-bnb-optimizer] +[default2]: [--dataloader-type {single,cyclic}] [--cpu-optimizer] +[default2]: [--cpu_torch_adam] [--codecarbon-dir CODECARBON_DIR] +[default2]: [--eval-only EVAL_ONLY] +[default2]: [--skip-train-iteration-range SKIP_TRAIN_ITERATION_RANGE [SKIP_TRAIN_ITERATION_RANGE ...]] +[default2]: [--inference] +[default2]: [--abort-on-unmet-fused-kernel-constraints] +[default2]: [--pp-partition-method PP_PARTITION_METHOD] +[default2]: [--seed SEED] [--init-method-std INIT_METHOD_STD] +[default2]: [--init-method-xavier-uniform] [--lr LR] +[default2]: [--lr-decay-style {constant,linear,cosine}] +[default2]: [--lr-decay-iters LR_DECAY_ITERS] +[default2]: [--lr-decay-samples LR_DECAY_SAMPLES] +[default2]: [--lr-decay-tokens LR_DECAY_TOKENS] +[default2]: [--lr-warmup-fraction LR_WARMUP_FRACTION] +[default2]: [--lr-warmup-iters LR_WARMUP_ITERS] +[default2]: [--lr-warmup-samples LR_WARMUP_SAMPLES] +[default2]: [--warmup WARMUP] [--min-lr MIN_LR] +[default2]: [--override-lr-scheduler] +[default2]: [--use-checkpoint-lr-scheduler] [--save SAVE] +[default2]: [--save-interval SAVE_INTERVAL] [--no-save-optim] +[default2]: [--no-save-rng] [--load LOAD] [--no-load-optim] +[default2]: [--no-load-rng] [--finetune] [--fp16] [--bf16] +[default2]: [--loss-scale LOSS_SCALE] +[default2]: [--initial-loss-scale INITIAL_LOSS_SCALE] +[default2]: [--min-loss-scale MIN_LOSS_SCALE] +[default2]: [--loss-scale-window LOSS_SCALE_WINDOW] +[default2]: [--hysteresis HYSTERESIS] [--fp32-residual-connection] +[default2]: [--no-query-key-layer-scaling] +[default2]: [--attention-softmax-in-fp32] +[default2]: [--accumulate-allreduce-grads-in-fp32] +[default2]: [--fp16-lm-cross-entropy] +[default2]: [--tensor-model-parallel-size TENSOR_MODEL_PARALLEL_SIZE] +[default2]: [--pipeline-model-parallel-size PIPELINE_MODEL_PARALLEL_SIZE] +[default2]: [--model-parallel-size MODEL_PARALLEL_SIZE] +[default2]: [--num-layers-per-virtual-pipeline-stage NUM_LAYERS_PER_VIRTUAL_PIPELINE_STAGE] +[default2]: [--distributed-backend {nccl,gloo}] +[default2]: [--DDP-impl {local,torch}] +[default2]: [--use-contiguous-buffers-in-ddp] +[default2]: [--no-scatter-gather-tensors-in-pipeline] +[default2]: [--local_rank LOCAL_RANK] +[default2]: [--lazy-mpu-init LAZY_MPU_INIT] +[default2]: [--use-cpu-initialization] [--eval-iters EVAL_ITERS] +[default2]: [--eval-interval EVAL_INTERVAL] +[default2]: [--data-path [DATA_PATH [DATA_PATH ...]]] +[default2]: [--split SPLIT] +[default2]: [--train-weighted-split-paths [TRAIN_WEIGHTED_SPLIT_PATHS [TRAIN_WEIGHTED_SPLIT_PATHS ...]]] +[default2]: [--valid-weighted-split-paths [VALID_WEIGHTED_SPLIT_PATHS [VALID_WEIGHTED_SPLIT_PATHS ...]]] +[default2]: [--test-weighted-split-paths [TEST_WEIGHTED_SPLIT_PATHS [TEST_WEIGHTED_SPLIT_PATHS ...]]] +[default2]: [--train-weighted-split-paths-path TRAIN_WEIGHTED_SPLIT_PATHS_PATH] +[default2]: [--valid-weighted-split-paths-path VALID_WEIGHTED_SPLIT_PATHS_PATH] +[default2]: [--test-weighted-split-paths-path TEST_WEIGHTED_SPLIT_PATHS_PATH] +[default2]: [--log-path LOG_PATH] [--vocab-file VOCAB_FILE] +[default2]: [--merge-file MERGE_FILE] +[default2]: [--vocab-extra-ids VOCAB_EXTRA_IDS] +[default2]: [--seq-length SEQ_LENGTH] +[default2]: [--encoder-seq-length ENCODER_SEQ_LENGTH] +[default2]: [--decoder-seq-length DECODER_SEQ_LENGTH] +[default2]: [--retriever-seq-length RETRIEVER_SEQ_LENGTH] +[default2]: [--sample-rate SAMPLE_RATE] [--mask-prob MASK_PROB] +[default2]: [--short-seq-prob SHORT_SEQ_PROB] [--mmap-warmup] +[default2]: [--num-workers NUM_WORKERS] +[default2]: [--valid-num-workers VALID_NUM_WORKERS] +[default2]: [--tokenizer-type {BertWordPieceLowerCase,BertWordPieceCase,GPT2BPETokenizer,PretrainedFromHF}] +[default2]: [--tokenizer-name-or-path TOKENIZER_NAME_OR_PATH] +[default2]: [--data-impl {lazy,cached,mmap,infer}] +[default2]: [--reset-position-ids] [--reset-attention-mask] +[default2]: [--eod-mask-loss] [--loss-on-targets-only] +[default2]: [--norm-target-loss] +[default2]: [--reweight-loss-based-on-position-frequency] +[default2]: [--noise-density NOISE_DENSITY] +[default2]: [--mean-noise-span-length MEAN_NOISE_SPAN_LENGTH] +[default2]: [--prefixlm] [--adlr-autoresume] +[default2]: [--adlr-autoresume-interval ADLR_AUTORESUME_INTERVAL] +[default7]:usage: finetune_t0.py [-h] [--num-layers NUM_LAYERS] +[default7]: [--hidden-size HIDDEN_SIZE] +[default7]: [--ffn-hidden-size FFN_HIDDEN_SIZE] +[default7]: [--num-attention-heads NUM_ATTENTION_HEADS] +[default7]: [--kv-channels KV_CHANNELS] +[default7]: [--max-position-embeddings MAX_POSITION_EMBEDDINGS] +[default4]:usage: finetune_t0.py [-h] [--num-layers NUM_LAYERS] +[default4]: [--hidden-size HIDDEN_SIZE] +[default4]: [--ffn-hidden-size FFN_HIDDEN_SIZE] +[default4]: [--num-attention-heads NUM_ATTENTION_HEADS] +[default4]: [--kv-channels KV_CHANNELS] +[default4]: [--max-position-embeddings MAX_POSITION_EMBEDDINGS] +[default4]: [--make-vocab-size-divisible-by MAKE_VOCAB_SIZE_DIVISIBLE_BY] +[default4]: [--pad-vocab-size-to PAD_VOCAB_SIZE_TO] +[default4]: [--layernorm-epsilon LAYERNORM_EPSILON] +[default4]: [--sync-tp-duplicated-parameters] +[default4]: [--apply-residual-connection-post-layernorm] +[default4]: [--embed-layernorm] [--openai-gelu] +[default4]: [--onnx-safe ONNX_SAFE] [--bert-no-binary-head] +[default4]: [--position-embedding-type {PositionEmbeddingType.rotary,PositionEmbeddingType.absolute,PositionEmbeddingType.alibi}] +[default4]: [--glu-activation {geglu,liglu,reglu,swiglu}] +[default4]: [--kill-switch-path KILL_SWITCH_PATH] +[default7]: [--make-vocab-size-divisible-by MAKE_VOCAB_SIZE_DIVISIBLE_BY] +[default7]: [--pad-vocab-size-to PAD_VOCAB_SIZE_TO] +[default7]: [--layernorm-epsilon LAYERNORM_EPSILON] +[default7]: [--sync-tp-duplicated-parameters] +[default7]: [--apply-residual-connection-post-layernorm] +[default4]: [--log-level {debug,info,warning,error,critical}] +[default4]: [--log-level-replica {debug,info,warning,error,critical}] +[default4]: [--attention-dropout ATTENTION_DROPOUT] +[default4]: [--hidden-dropout HIDDEN_DROPOUT] +[default4]: [--weight-decay WEIGHT_DECAY] [--clip-grad CLIP_GRAD] +[default4]: [--adam-beta1 ADAM_BETA1] [--adam-beta2 ADAM_BETA2] +[default4]: [--adam-eps ADAM_EPS] [--sgd-momentum SGD_MOMENTUM] +[default4]: [--micro-batch-size MICRO_BATCH_SIZE] +[default4]: [--batch-size BATCH_SIZE] +[default4]: [--global-batch-size GLOBAL_BATCH_SIZE] +[default4]: [--rampup-batch-size [RAMPUP_BATCH_SIZE [RAMPUP_BATCH_SIZE ...]]] +[default4]: [--checkpoint-activations] +[default4]: [--distribute-checkpointed-activations] +[default4]: [--checkpoint-num-layers CHECKPOINT_NUM_LAYERS] +[default4]: [--train-iters TRAIN_ITERS] +[default4]: [--train-samples TRAIN_SAMPLES] +[default4]: [--train-tokens TRAIN_TOKENS] +[default4]: [--log-interval LOG_INTERVAL] +[default4]: [--exit-interval EXIT_INTERVAL] +[default4]: [--exit-duration-in-mins EXIT_DURATION_IN_MINS] +[default4]: [--tensorboard-dir TENSORBOARD_DIR] +[default4]: [--no-masked-softmax-fusion] [--no-bias-gelu-fusion] +[default4]: [--no-bias-dropout-fusion] [--optimizer {adam,sgd}] +[default4]: [--use-bnb-optimizer] +[default4]: [--dataloader-type {single,cyclic}] [--cpu-optimizer] +[default4]: [--cpu_torch_adam] [--codecarbon-dir CODECARBON_DIR] +[default4]: [--eval-only EVAL_ONLY] +[default4]: [--skip-train-iteration-range SKIP_TRAIN_ITERATION_RANGE [SKIP_TRAIN_ITERATION_RANGE ...]] +[default4]: [--inference] +[default4]: [--abort-on-unmet-fused-kernel-constraints] +[default4]: [--pp-partition-method PP_PARTITION_METHOD] +[default4]: [--seed SEED] [--init-method-std INIT_METHOD_STD] +[default4]: [--init-method-xavier-uniform] [--lr LR] +[default4]: [--lr-decay-style {constant,linear,cosine}] +[default4]: [--lr-decay-iters LR_DECAY_ITERS] +[default4]: [--lr-decay-samples LR_DECAY_SAMPLES] +[default4]: [--lr-decay-tokens LR_DECAY_TOKENS] +[default4]: [--lr-warmup-fraction LR_WARMUP_FRACTION] +[default4]: [--lr-warmup-iters LR_WARMUP_ITERS] +[default4]: [--lr-warmup-samples LR_WARMUP_SAMPLES] +[default4]: [--warmup WARMUP] [--min-lr MIN_LR] +[default4]: [--override-lr-scheduler] +[default4]: [--use-checkpoint-lr-scheduler] [--save SAVE] +[default4]: [--save-interval SAVE_INTERVAL] [--no-save-optim] +[default4]: [--no-save-rng] [--load LOAD] [--no-load-optim] +[default4]: [--no-load-rng] [--finetune] [--fp16] [--bf16] +[default4]: [--loss-scale LOSS_SCALE] +[default4]: [--initial-loss-scale INITIAL_LOSS_SCALE] +[default4]: [--min-loss-scale MIN_LOSS_SCALE] +[default4]: [--loss-scale-window LOSS_SCALE_WINDOW] +[default4]: [--hysteresis HYSTERESIS] [--fp32-residual-connection] +[default4]: [--no-query-key-layer-scaling] +[default4]: [--attention-softmax-in-fp32] +[default4]: [--accumulate-allreduce-grads-in-fp32] +[default4]: [--fp16-lm-cross-entropy] +[default4]: [--tensor-model-parallel-size TENSOR_MODEL_PARALLEL_SIZE] +[default4]: [--pipeline-model-parallel-size PIPELINE_MODEL_PARALLEL_SIZE] +[default4]: [--model-parallel-size MODEL_PARALLEL_SIZE] +[default4]: [--num-layers-per-virtual-pipeline-stage NUM_LAYERS_PER_VIRTUAL_PIPELINE_STAGE] +[default4]: [--distributed-backend {nccl,gloo}] +[default4]: [--DDP-impl {local,torch}] +[default4]: [--use-contiguous-buffers-in-ddp] +[default4]: [--no-scatter-gather-tensors-in-pipeline] +[default4]: [--local_rank LOCAL_RANK] +[default4]: [--lazy-mpu-init LAZY_MPU_INIT] +[default4]: [--use-cpu-initialization] [--eval-iters EVAL_ITERS] +[default4]: [--eval-interval EVAL_INTERVAL] +[default4]: [--data-path [DATA_PATH [DATA_PATH ...]]] +[default4]: [--split SPLIT] +[default4]: [--train-weighted-split-paths [TRAIN_WEIGHTED_SPLIT_PATHS [TRAIN_WEIGHTED_SPLIT_PATHS ...]]] +[default4]: [--valid-weighted-split-paths [VALID_WEIGHTED_SPLIT_PATHS [VALID_WEIGHTED_SPLIT_PATHS ...]]] +[default4]: [--test-weighted-split-paths [TEST_WEIGHTED_SPLIT_PATHS [TEST_WEIGHTED_SPLIT_PATHS ...]]] +[default4]: [--train-weighted-split-paths-path TRAIN_WEIGHTED_SPLIT_PATHS_PATH] +[default4]: [--valid-weighted-split-paths-path VALID_WEIGHTED_SPLIT_PATHS_PATH] +[default4]: [--test-weighted-split-paths-path TEST_WEIGHTED_SPLIT_PATHS_PATH] +[default4]: [--log-path LOG_PATH] [--vocab-file VOCAB_FILE] +[default4]: [--merge-file MERGE_FILE] +[default4]: [--vocab-extra-ids VOCAB_EXTRA_IDS] +[default4]: [--seq-length SEQ_LENGTH] +[default4]: [--encoder-seq-length ENCODER_SEQ_LENGTH] +[default4]: [--decoder-seq-length DECODER_SEQ_LENGTH] +[default4]: [--retriever-seq-length RETRIEVER_SEQ_LENGTH] +[default4]: [--sample-rate SAMPLE_RATE] [--mask-prob MASK_PROB] +[default4]: [--short-seq-prob SHORT_SEQ_PROB] [--mmap-warmup] +[default4]: [--num-workers NUM_WORKERS] +[default4]: [--valid-num-workers VALID_NUM_WORKERS] +[default4]: [--tokenizer-type {BertWordPieceLowerCase,BertWordPieceCase,GPT2BPETokenizer,PretrainedFromHF}] +[default4]: [--tokenizer-name-or-path TOKENIZER_NAME_OR_PATH] +[default4]: [--data-impl {lazy,cached,mmap,infer}] +[default4]: [--reset-position-ids] [--reset-attention-mask] +[default4]: [--eod-mask-loss] [--loss-on-targets-only] +[default4]: [--norm-target-loss] +[default4]: [--reweight-loss-based-on-position-frequency] +[default4]: [--noise-density NOISE_DENSITY] +[default4]: [--mean-noise-span-length MEAN_NOISE_SPAN_LENGTH] +[default4]: [--prefixlm] [--adlr-autoresume] +[default4]: [--adlr-autoresume-interval ADLR_AUTORESUME_INTERVAL] +[default4]: [--ict-head-size ICT_HEAD_SIZE] +[default4]: [--biencoder-projection-dim BIENCODER_PROJECTION_DIM] +[default4]: [--biencoder-shared-query-context-model] +[default4]: [--ict-load ICT_LOAD] [--bert-load BERT_LOAD] +[default4]: [--titles-data-path TITLES_DATA_PATH] +[default4]: [--query-in-block-prob QUERY_IN_BLOCK_PROB] +[default4]: [--use-one-sent-docs] +[default4]: [--evidence-data-path EVIDENCE_DATA_PATH] +[default4]: [--retriever-report-topk-accuracies RETRIEVER_REPORT_TOPK_ACCURACIES [RETRIEVER_REPORT_TOPK_ACCURACIES ...]] +[default4]: [--retriever-score-scaling] +[default4]: [--block-data-path BLOCK_DATA_PATH] +[default4]: [--embedding-path EMBEDDING_PATH] +[default4]: [--indexer-batch-size INDEXER_BATCH_SIZE] +[default4]: [--indexer-log-interval INDEXER_LOG_INTERVAL] +[default4]: [--num-classes NUM_CLASSES] [--img-dim IMG_DIM] +[default4]: [--num-channels NUM_CHANNELS] [--patch-dim PATCH_DIM] +[default4]: [--log-params-norm] [--log-num-zeros-in-grad] +[default4]: [--tensorboard-log-interval TENSORBOARD_LOG_INTERVAL] +[default4]: [--tensorboard-queue-size TENSORBOARD_QUEUE_SIZE] +[default4]: [--log-timers-to-tensorboard] +[default4]: [--log-batch-size-to-tensorboard] +[default4]: [--no-log-learnig-rate-to-tensorboard] +[default4]: [--no-log-loss-scale-to-tensorboard] +[default4]: [--log-validation-ppl-to-tensorboard] +[default4]: [--zero-stage ZERO_STAGE] [--zero-reduce-scatter] +[default4]: [--zero-contigious-gradients] +[default4]: [--zero-reduce-bucket-size ZERO_REDUCE_BUCKET_SIZE] +[default4]: [--zero-allgather-bucket-size ZERO_ALLGATHER_BUCKET_SIZE] +[default4]: [--remote-device {none,cpu,nvme}] [--use-pin-memory] +[default4]: [--scattered-embeddings] [--split-transformers] +[default4]: [--memory-centric-tiled-linear] +[default4]: [--tile-factor TILE_FACTOR] +[default4]: [--deepspeed-activation-checkpointing] +[default4]: [--partition-activations] [--contigious-checkpointing] +[default4]: [--checkpoint-in-cpu] [--synchronize-each-layer] +[default4]: [--profile-backward] [--deepspeed] +[default4]: [--deepspeed_config DEEPSPEED_CONFIG] [--deepscale] +[default4]: [--deepscale_config DEEPSCALE_CONFIG] [--deepspeed_mpi] +[default4]:finetune_t0.py: error: unrecognized arguments: --reset-progress +[default7]: [--embed-layernorm] [--openai-gelu] +[default7]: [--onnx-safe ONNX_SAFE] [--bert-no-binary-head] +[default1]:usage: finetune_t0.py [-h] [--num-layers NUM_LAYERS] +[default7]: [--position-embedding-type {PositionEmbeddingType.rotary,PositionEmbeddingType.absolute,PositionEmbeddingType.alibi}] +[default7]: [--glu-activation {geglu,liglu,reglu,swiglu}] +[default1]: [--hidden-size HIDDEN_SIZE] +[default2]: [--ict-head-size ICT_HEAD_SIZE] +[default2]: [--biencoder-projection-dim BIENCODER_PROJECTION_DIM] +[default2]: [--biencoder-shared-query-context-model] +[default2]: [--ict-load ICT_LOAD] [--bert-load BERT_LOAD] +[default2]: [--titles-data-path TITLES_DATA_PATH] +[default7]: [--kill-switch-path KILL_SWITCH_PATH] +[default6]:usage: finetune_t0.py [-h] [--num-layers NUM_LAYERS] +[default6]: [--hidden-size HIDDEN_SIZE] +[default7]: [--log-level {debug,info,warning,error,critical}] +[default2]: [--query-in-block-prob QUERY_IN_BLOCK_PROB] +[default2]: [--use-one-sent-docs] +[default2]: [--evidence-data-path EVIDENCE_DATA_PATH] +[default2]: [--retriever-report-topk-accuracies RETRIEVER_REPORT_TOPK_ACCURACIES [RETRIEVER_REPORT_TOPK_ACCURACIES ...]] +[default2]: [--retriever-score-scaling] +[default2]: [--block-data-path BLOCK_DATA_PATH] +[default2]: [--embedding-path EMBEDDING_PATH] +[default1]: [--ffn-hidden-size FFN_HIDDEN_SIZE] +[default1]: [--num-attention-heads NUM_ATTENTION_HEADS] +[default1]: [--kv-channels KV_CHANNELS] +[default1]: [--max-position-embeddings MAX_POSITION_EMBEDDINGS] +[default1]: [--make-vocab-size-divisible-by MAKE_VOCAB_SIZE_DIVISIBLE_BY] +[default1]: [--pad-vocab-size-to PAD_VOCAB_SIZE_TO] +[default1]: [--layernorm-epsilon LAYERNORM_EPSILON] +[default6]: [--ffn-hidden-size FFN_HIDDEN_SIZE] +[default6]: [--num-attention-heads NUM_ATTENTION_HEADS] +[default7]: [--log-level-replica {debug,info,warning,error,critical}] +[default6]: [--kv-channels KV_CHANNELS] +[default2]: [--indexer-batch-size INDEXER_BATCH_SIZE] +[default2]: [--indexer-log-interval INDEXER_LOG_INTERVAL] +[default7]: [--attention-dropout ATTENTION_DROPOUT] +[default7]: [--hidden-dropout HIDDEN_DROPOUT] +[default7]: [--weight-decay WEIGHT_DECAY] [--clip-grad CLIP_GRAD] +[default2]: [--num-classes NUM_CLASSES] [--img-dim IMG_DIM] +[default2]: [--num-channels NUM_CHANNELS] [--patch-dim PATCH_DIM] +[default6]: [--max-position-embeddings MAX_POSITION_EMBEDDINGS] +[default6]: [--make-vocab-size-divisible-by MAKE_VOCAB_SIZE_DIVISIBLE_BY] +[default7]: [--adam-beta1 ADAM_BETA1] [--adam-beta2 ADAM_BETA2] +[default7]: [--adam-eps ADAM_EPS] [--sgd-momentum SGD_MOMENTUM] +[default7]: [--micro-batch-size MICRO_BATCH_SIZE] +[default5]:usage: finetune_t0.py [-h] [--num-layers NUM_LAYERS] +[default5]: [--hidden-size HIDDEN_SIZE] +[default5]: [--ffn-hidden-size FFN_HIDDEN_SIZE] +[default5]: [--num-attention-heads NUM_ATTENTION_HEADS] +[default5]: [--kv-channels KV_CHANNELS] +[default5]: [--max-position-embeddings MAX_POSITION_EMBEDDINGS] +[default2]: [--log-params-norm] [--log-num-zeros-in-grad] +[default2]: [--tensorboard-log-interval TENSORBOARD_LOG_INTERVAL] +[default2]: [--tensorboard-queue-size TENSORBOARD_QUEUE_SIZE] +[default2]: [--log-timers-to-tensorboard] +[default5]: [--make-vocab-size-divisible-by MAKE_VOCAB_SIZE_DIVISIBLE_BY] +[default5]: [--pad-vocab-size-to PAD_VOCAB_SIZE_TO] +[default5]: [--layernorm-epsilon LAYERNORM_EPSILON] +[default6]: [--pad-vocab-size-to PAD_VOCAB_SIZE_TO] +[default2]: [--log-batch-size-to-tensorboard] +[default7]: [--batch-size BATCH_SIZE] +[default7]: [--global-batch-size GLOBAL_BATCH_SIZE] +[default1]: [--sync-tp-duplicated-parameters] +[default7]: [--rampup-batch-size [RAMPUP_BATCH_SIZE [RAMPUP_BATCH_SIZE ...]]] +[default5]: [--sync-tp-duplicated-parameters] +[default5]: [--apply-residual-connection-post-layernorm] +[default5]: [--embed-layernorm] [--openai-gelu] +[default5]: [--onnx-safe ONNX_SAFE] [--bert-no-binary-head] +[default5]: [--position-embedding-type {PositionEmbeddingType.rotary,PositionEmbeddingType.absolute,PositionEmbeddingType.alibi}] +[default5]: [--glu-activation {geglu,liglu,reglu,swiglu}] +[default5]: [--kill-switch-path KILL_SWITCH_PATH] +[default5]: [--log-level {debug,info,warning,error,critical}] +[default7]: [--checkpoint-activations] +[default1]: [--apply-residual-connection-post-layernorm] +[default2]: [--no-log-learnig-rate-to-tensorboard] +[default2]: [--no-log-loss-scale-to-tensorboard] +[default2]: [--log-validation-ppl-to-tensorboard] +[default6]: [--layernorm-epsilon LAYERNORM_EPSILON] +[default6]: [--sync-tp-duplicated-parameters] +[default7]: [--distribute-checkpointed-activations] +[default5]: [--log-level-replica {debug,info,warning,error,critical}] +[default5]: [--attention-dropout ATTENTION_DROPOUT] +[default5]: [--hidden-dropout HIDDEN_DROPOUT] +[default1]: [--embed-layernorm] [--openai-gelu] +[default2]: [--zero-stage ZERO_STAGE] [--zero-reduce-scatter] +[default2]: [--zero-contigious-gradients] +[default2]: [--zero-reduce-bucket-size ZERO_REDUCE_BUCKET_SIZE] +[default2]: [--zero-allgather-bucket-size ZERO_ALLGATHER_BUCKET_SIZE] +[default2]: [--remote-device {none,cpu,nvme}] [--use-pin-memory] +[default6]: [--apply-residual-connection-post-layernorm] +[default1]: [--onnx-safe ONNX_SAFE] [--bert-no-binary-head] +[default2]: [--scattered-embeddings] [--split-transformers] +[default6]: [--embed-layernorm] [--openai-gelu] +[default6]: [--onnx-safe ONNX_SAFE] [--bert-no-binary-head] +[default6]: [--position-embedding-type {PositionEmbeddingType.rotary,PositionEmbeddingType.absolute,PositionEmbeddingType.alibi}] +[default6]: [--glu-activation {geglu,liglu,reglu,swiglu}] +[default6]: [--kill-switch-path KILL_SWITCH_PATH] +[default5]: [--weight-decay WEIGHT_DECAY] [--clip-grad CLIP_GRAD] +[default5]: [--adam-beta1 ADAM_BETA1] [--adam-beta2 ADAM_BETA2] +[default5]: [--adam-eps ADAM_EPS] [--sgd-momentum SGD_MOMENTUM] +[default5]: [--micro-batch-size MICRO_BATCH_SIZE] +[default7]: [--checkpoint-num-layers CHECKPOINT_NUM_LAYERS] +[default7]: [--train-iters TRAIN_ITERS] +[default2]: [--memory-centric-tiled-linear] +[default2]: [--tile-factor TILE_FACTOR] +[default2]: [--deepspeed-activation-checkpointing] +[default7]: [--train-samples TRAIN_SAMPLES] +[default7]: [--train-tokens TRAIN_TOKENS] +[default7]: [--log-interval LOG_INTERVAL] +[default7]: [--exit-interval EXIT_INTERVAL] +[default7]: [--exit-duration-in-mins EXIT_DURATION_IN_MINS] +[default5]: [--batch-size BATCH_SIZE] +[default5]: [--global-batch-size GLOBAL_BATCH_SIZE] +[default5]: [--rampup-batch-size [RAMPUP_BATCH_SIZE [RAMPUP_BATCH_SIZE ...]]] +[default5]: [--checkpoint-activations] +[default5]: [--distribute-checkpointed-activations] +[default2]: [--partition-activations] [--contigious-checkpointing] +[default2]: [--checkpoint-in-cpu] [--synchronize-each-layer] +[default2]: [--profile-backward] [--deepspeed] +[default2]: [--deepspeed_config DEEPSPEED_CONFIG] [--deepscale] +[default2]: [--deepscale_config DEEPSCALE_CONFIG] [--deepspeed_mpi] +[default2]:finetune_t0.py: error: unrecognized arguments: --reset-progress +[default7]: [--tensorboard-dir TENSORBOARD_DIR] +[default7]: [--no-masked-softmax-fusion] [--no-bias-gelu-fusion] +[default7]: [--no-bias-dropout-fusion] [--optimizer {adam,sgd}] +[default7]: [--use-bnb-optimizer] +[default5]: [--checkpoint-num-layers CHECKPOINT_NUM_LAYERS] +[default5]: [--train-iters TRAIN_ITERS] +[default5]: [--train-samples TRAIN_SAMPLES] +[default5]: [--train-tokens TRAIN_TOKENS] +[default1]: [--position-embedding-type {PositionEmbeddingType.rotary,PositionEmbeddingType.absolute,PositionEmbeddingType.alibi}] +[default7]: [--dataloader-type {single,cyclic}] [--cpu-optimizer] +[default7]: [--cpu_torch_adam] [--codecarbon-dir CODECARBON_DIR] +[default1]: [--glu-activation {geglu,liglu,reglu,swiglu}] +[default1]: [--kill-switch-path KILL_SWITCH_PATH] +[default1]: [--log-level {debug,info,warning,error,critical}] +[default1]: [--log-level-replica {debug,info,warning,error,critical}] +[default7]: [--eval-only EVAL_ONLY] +[default1]: [--attention-dropout ATTENTION_DROPOUT] +[default1]: [--hidden-dropout HIDDEN_DROPOUT] +[default7]: [--skip-train-iteration-range SKIP_TRAIN_ITERATION_RANGE [SKIP_TRAIN_ITERATION_RANGE ...]] +[default7]: [--inference] +[default7]: [--abort-on-unmet-fused-kernel-constraints] +[default6]: [--log-level {debug,info,warning,error,critical}] +[default6]: [--log-level-replica {debug,info,warning,error,critical}] +[default6]: [--attention-dropout ATTENTION_DROPOUT] +[default1]: [--weight-decay WEIGHT_DECAY] [--clip-grad CLIP_GRAD] +[default5]: [--log-interval LOG_INTERVAL] +[default7]: [--pp-partition-method PP_PARTITION_METHOD] +[default7]: [--seed SEED] [--init-method-std INIT_METHOD_STD] +[default7]: [--init-method-xavier-uniform] [--lr LR] +[default7]: [--lr-decay-style {constant,linear,cosine}] +[default7]: [--lr-decay-iters LR_DECAY_ITERS] +[default7]: [--lr-decay-samples LR_DECAY_SAMPLES] +[default1]: [--adam-beta1 ADAM_BETA1] [--adam-beta2 ADAM_BETA2] +[default1]: [--adam-eps ADAM_EPS] [--sgd-momentum SGD_MOMENTUM] +[default1]: [--micro-batch-size MICRO_BATCH_SIZE] +[default6]: [--hidden-dropout HIDDEN_DROPOUT] +[default6]: [--weight-decay WEIGHT_DECAY] [--clip-grad CLIP_GRAD] +[default6]: [--adam-beta1 ADAM_BETA1] [--adam-beta2 ADAM_BETA2] +[default6]: [--adam-eps ADAM_EPS] [--sgd-momentum SGD_MOMENTUM] +[default6]: [--micro-batch-size MICRO_BATCH_SIZE] +[default1]: [--batch-size BATCH_SIZE] +[default6]: [--batch-size BATCH_SIZE] +[default6]: [--global-batch-size GLOBAL_BATCH_SIZE] +[default6]: [--rampup-batch-size [RAMPUP_BATCH_SIZE [RAMPUP_BATCH_SIZE ...]]] +[default6]: [--checkpoint-activations] +[default6]: [--distribute-checkpointed-activations] +[default7]: [--lr-decay-tokens LR_DECAY_TOKENS] +[default7]: [--lr-warmup-fraction LR_WARMUP_FRACTION] +[default5]: [--exit-interval EXIT_INTERVAL] +[default5]: [--exit-duration-in-mins EXIT_DURATION_IN_MINS] +[default5]: [--tensorboard-dir TENSORBOARD_DIR] +[default5]: [--no-masked-softmax-fusion] [--no-bias-gelu-fusion] +[default1]: [--global-batch-size GLOBAL_BATCH_SIZE] +[default1]: [--rampup-batch-size [RAMPUP_BATCH_SIZE [RAMPUP_BATCH_SIZE ...]]] +[default1]: [--checkpoint-activations] +[default7]: [--lr-warmup-iters LR_WARMUP_ITERS] +[default7]: [--lr-warmup-samples LR_WARMUP_SAMPLES] +[default5]: [--no-bias-dropout-fusion] [--optimizer {adam,sgd}] +[default5]: [--use-bnb-optimizer] +[default7]: [--warmup WARMUP] [--min-lr MIN_LR] +[default7]: [--override-lr-scheduler] +[default5]: [--dataloader-type {single,cyclic}] [--cpu-optimizer] +[default1]: [--distribute-checkpointed-activations] +[default1]: [--checkpoint-num-layers CHECKPOINT_NUM_LAYERS] +[default1]: [--train-iters TRAIN_ITERS] +[default7]: [--use-checkpoint-lr-scheduler] [--save SAVE] +[default7]: [--save-interval SAVE_INTERVAL] [--no-save-optim] +[default7]: [--no-save-rng] [--load LOAD] [--no-load-optim] +[default1]: [--train-samples TRAIN_SAMPLES] +[default1]: [--train-tokens TRAIN_TOKENS] +[default7]: [--no-load-rng] [--finetune] [--fp16] [--bf16] +[default1]: [--log-interval LOG_INTERVAL] +[default7]: [--loss-scale LOSS_SCALE] +[default1]: [--exit-interval EXIT_INTERVAL] +[default6]: [--checkpoint-num-layers CHECKPOINT_NUM_LAYERS] +[default1]: [--exit-duration-in-mins EXIT_DURATION_IN_MINS] +[default7]: [--initial-loss-scale INITIAL_LOSS_SCALE] +[default5]: [--cpu_torch_adam] [--codecarbon-dir CODECARBON_DIR] +[default6]: [--train-iters TRAIN_ITERS] +[default6]: [--train-samples TRAIN_SAMPLES] +[default5]: [--eval-only EVAL_ONLY] +[default5]: [--skip-train-iteration-range SKIP_TRAIN_ITERATION_RANGE [SKIP_TRAIN_ITERATION_RANGE ...]] +[default5]: [--inference] +[default5]: [--abort-on-unmet-fused-kernel-constraints] +[default5]: [--pp-partition-method PP_PARTITION_METHOD] +[default5]: [--seed SEED] [--init-method-std INIT_METHOD_STD] +[default6]: [--train-tokens TRAIN_TOKENS] +[default5]: [--init-method-xavier-uniform] [--lr LR] +[default1]: [--tensorboard-dir TENSORBOARD_DIR] +[default1]: [--no-masked-softmax-fusion] [--no-bias-gelu-fusion] +[default1]: [--no-bias-dropout-fusion] [--optimizer {adam,sgd}] +[default1]: [--use-bnb-optimizer] +[default6]: [--log-interval LOG_INTERVAL] +[default7]: [--min-loss-scale MIN_LOSS_SCALE] +[default7]: [--loss-scale-window LOSS_SCALE_WINDOW] +[default7]: [--hysteresis HYSTERESIS] [--fp32-residual-connection] +[default1]: [--dataloader-type {single,cyclic}] [--cpu-optimizer] +[default1]: [--cpu_torch_adam] [--codecarbon-dir CODECARBON_DIR] +[default1]: [--eval-only EVAL_ONLY] +[default7]: [--no-query-key-layer-scaling] +[default7]: [--attention-softmax-in-fp32] +[default1]: [--skip-train-iteration-range SKIP_TRAIN_ITERATION_RANGE [SKIP_TRAIN_ITERATION_RANGE ...]] +[default6]: [--exit-interval EXIT_INTERVAL] +[default1]: [--inference] +[default7]: [--accumulate-allreduce-grads-in-fp32] +[default7]: [--fp16-lm-cross-entropy] +[default7]: [--tensor-model-parallel-size TENSOR_MODEL_PARALLEL_SIZE] +[default5]: [--lr-decay-style {constant,linear,cosine}] +[default1]: [--abort-on-unmet-fused-kernel-constraints] +[default7]: [--pipeline-model-parallel-size PIPELINE_MODEL_PARALLEL_SIZE] +[default7]: [--model-parallel-size MODEL_PARALLEL_SIZE] +[default1]: [--pp-partition-method PP_PARTITION_METHOD] +[default1]: [--seed SEED] [--init-method-std INIT_METHOD_STD] +[default5]: [--lr-decay-iters LR_DECAY_ITERS] +[default5]: [--lr-decay-samples LR_DECAY_SAMPLES] +[default1]: [--init-method-xavier-uniform] [--lr LR] +[default7]: [--num-layers-per-virtual-pipeline-stage NUM_LAYERS_PER_VIRTUAL_PIPELINE_STAGE] +[default1]: [--lr-decay-style {constant,linear,cosine}] +[default1]: [--lr-decay-iters LR_DECAY_ITERS] +[default5]: [--lr-decay-tokens LR_DECAY_TOKENS] +[default7]: [--distributed-backend {nccl,gloo}] +[default7]: [--DDP-impl {local,torch}] +[default7]: [--use-contiguous-buffers-in-ddp] +[default1]: [--lr-decay-samples LR_DECAY_SAMPLES] +[default7]: [--no-scatter-gather-tensors-in-pipeline] +[default7]: [--local_rank LOCAL_RANK] +[default1]: [--lr-decay-tokens LR_DECAY_TOKENS] +[default6]: [--exit-duration-in-mins EXIT_DURATION_IN_MINS] +[default1]: [--lr-warmup-fraction LR_WARMUP_FRACTION] +[default7]: [--lazy-mpu-init LAZY_MPU_INIT] +[default6]: [--tensorboard-dir TENSORBOARD_DIR] +[default6]: [--no-masked-softmax-fusion] [--no-bias-gelu-fusion] +[default6]: [--no-bias-dropout-fusion] [--optimizer {adam,sgd}] +[default6]: [--use-bnb-optimizer] +[default7]: [--use-cpu-initialization] [--eval-iters EVAL_ITERS] +[default7]: [--eval-interval EVAL_INTERVAL] +[default1]: [--lr-warmup-iters LR_WARMUP_ITERS] +[default7]: [--data-path [DATA_PATH [DATA_PATH ...]]] +[default6]: [--dataloader-type {single,cyclic}] [--cpu-optimizer] +[default1]: [--lr-warmup-samples LR_WARMUP_SAMPLES] +[default5]: [--lr-warmup-fraction LR_WARMUP_FRACTION] +[default5]: [--lr-warmup-iters LR_WARMUP_ITERS] +[default5]: [--lr-warmup-samples LR_WARMUP_SAMPLES] +[default6]: [--cpu_torch_adam] [--codecarbon-dir CODECARBON_DIR] +[default6]: [--eval-only EVAL_ONLY] +[default5]: [--warmup WARMUP] [--min-lr MIN_LR] +[default6]: [--skip-train-iteration-range SKIP_TRAIN_ITERATION_RANGE [SKIP_TRAIN_ITERATION_RANGE ...]] +[default6]: [--inference] +[default6]: [--abort-on-unmet-fused-kernel-constraints] +[default5]: [--override-lr-scheduler] +[default5]: [--use-checkpoint-lr-scheduler] [--save SAVE] +[default5]: [--save-interval SAVE_INTERVAL] [--no-save-optim] +[default7]: [--split SPLIT] +[default6]: [--pp-partition-method PP_PARTITION_METHOD] +[default5]: [--no-save-rng] [--load LOAD] [--no-load-optim] +[default5]: [--no-load-rng] [--finetune] [--fp16] [--bf16] +[default7]: [--train-weighted-split-paths [TRAIN_WEIGHTED_SPLIT_PATHS [TRAIN_WEIGHTED_SPLIT_PATHS ...]]] +[default7]: [--valid-weighted-split-paths [VALID_WEIGHTED_SPLIT_PATHS [VALID_WEIGHTED_SPLIT_PATHS ...]]] +[default6]: [--seed SEED] [--init-method-std INIT_METHOD_STD] +[default7]: [--test-weighted-split-paths [TEST_WEIGHTED_SPLIT_PATHS [TEST_WEIGHTED_SPLIT_PATHS ...]]] +[default7]: [--train-weighted-split-paths-path TRAIN_WEIGHTED_SPLIT_PATHS_PATH] +[default7]: [--valid-weighted-split-paths-path VALID_WEIGHTED_SPLIT_PATHS_PATH] +[default7]: [--test-weighted-split-paths-path TEST_WEIGHTED_SPLIT_PATHS_PATH] +[default7]: [--log-path LOG_PATH] [--vocab-file VOCAB_FILE] +[default1]: [--warmup WARMUP] [--min-lr MIN_LR] +[default1]: [--override-lr-scheduler] +[default7]: [--merge-file MERGE_FILE] +[default7]: [--vocab-extra-ids VOCAB_EXTRA_IDS] +[default7]: [--seq-length SEQ_LENGTH] +[default7]: [--encoder-seq-length ENCODER_SEQ_LENGTH] +[default7]: [--decoder-seq-length DECODER_SEQ_LENGTH] +[default6]: [--init-method-xavier-uniform] [--lr LR] +[default6]: [--lr-decay-style {constant,linear,cosine}] +[default1]: [--use-checkpoint-lr-scheduler] [--save SAVE] +[default1]: [--save-interval SAVE_INTERVAL] [--no-save-optim] +[default1]: [--no-save-rng] [--load LOAD] [--no-load-optim] +[default1]: [--no-load-rng] [--finetune] [--fp16] [--bf16] +[default7]: [--retriever-seq-length RETRIEVER_SEQ_LENGTH] +[default6]: [--lr-decay-iters LR_DECAY_ITERS] +[default6]: [--lr-decay-samples LR_DECAY_SAMPLES] +[default7]: [--sample-rate SAMPLE_RATE] [--mask-prob MASK_PROB] +[default7]: [--short-seq-prob SHORT_SEQ_PROB] [--mmap-warmup] +[default6]: [--lr-decay-tokens LR_DECAY_TOKENS] +[default1]: [--loss-scale LOSS_SCALE] +[default7]: [--num-workers NUM_WORKERS] +[default7]: [--valid-num-workers VALID_NUM_WORKERS] +[default6]: [--lr-warmup-fraction LR_WARMUP_FRACTION] +[default6]: [--lr-warmup-iters LR_WARMUP_ITERS] +[default6]: [--lr-warmup-samples LR_WARMUP_SAMPLES] +[default5]: [--loss-scale LOSS_SCALE] +[default5]: [--initial-loss-scale INITIAL_LOSS_SCALE] +[default5]: [--min-loss-scale MIN_LOSS_SCALE] +[default5]: [--loss-scale-window LOSS_SCALE_WINDOW] +[default6]: [--warmup WARMUP] [--min-lr MIN_LR] +[default5]: [--hysteresis HYSTERESIS] [--fp32-residual-connection] +[default7]: [--tokenizer-type {BertWordPieceLowerCase,BertWordPieceCase,GPT2BPETokenizer,PretrainedFromHF}] +[default1]: [--initial-loss-scale INITIAL_LOSS_SCALE] +[default6]: [--override-lr-scheduler] +[default7]: [--tokenizer-name-or-path TOKENIZER_NAME_OR_PATH] +[default7]: [--data-impl {lazy,cached,mmap,infer}] +[default7]: [--reset-position-ids] [--reset-attention-mask] +[default7]: [--eod-mask-loss] [--loss-on-targets-only] +[default1]: [--min-loss-scale MIN_LOSS_SCALE] +[default1]: [--loss-scale-window LOSS_SCALE_WINDOW] +[default6]: [--use-checkpoint-lr-scheduler] [--save SAVE] +[default6]: [--save-interval SAVE_INTERVAL] [--no-save-optim] +[default6]: [--no-save-rng] [--load LOAD] [--no-load-optim] +[default6]: [--no-load-rng] [--finetune] [--fp16] [--bf16] +[default6]: [--loss-scale LOSS_SCALE] +[default7]: [--norm-target-loss] +[default6]: [--initial-loss-scale INITIAL_LOSS_SCALE] +[default5]: [--no-query-key-layer-scaling] +[default6]: [--min-loss-scale MIN_LOSS_SCALE] +[default1]: [--hysteresis HYSTERESIS] [--fp32-residual-connection] +[default1]: [--no-query-key-layer-scaling] +[default6]: [--loss-scale-window LOSS_SCALE_WINDOW] +[default5]: [--attention-softmax-in-fp32] +[default1]: [--attention-softmax-in-fp32] +[default6]: [--hysteresis HYSTERESIS] [--fp32-residual-connection] +[default6]: [--no-query-key-layer-scaling] +[default6]: [--attention-softmax-in-fp32] +[default6]: [--accumulate-allreduce-grads-in-fp32] +[default6]: [--fp16-lm-cross-entropy] +[default6]: [--tensor-model-parallel-size TENSOR_MODEL_PARALLEL_SIZE] +[default5]: [--accumulate-allreduce-grads-in-fp32] +[default5]: [--fp16-lm-cross-entropy] +[default5]: [--tensor-model-parallel-size TENSOR_MODEL_PARALLEL_SIZE] +[default1]: [--accumulate-allreduce-grads-in-fp32] +[default7]: [--reweight-loss-based-on-position-frequency] +[default1]: [--fp16-lm-cross-entropy] +[default1]: [--tensor-model-parallel-size TENSOR_MODEL_PARALLEL_SIZE] +[default7]: [--noise-density NOISE_DENSITY] +[default6]: [--pipeline-model-parallel-size PIPELINE_MODEL_PARALLEL_SIZE] +[default1]: [--pipeline-model-parallel-size PIPELINE_MODEL_PARALLEL_SIZE] +[default1]: [--model-parallel-size MODEL_PARALLEL_SIZE] +[default6]: [--model-parallel-size MODEL_PARALLEL_SIZE] +[default6]: [--num-layers-per-virtual-pipeline-stage NUM_LAYERS_PER_VIRTUAL_PIPELINE_STAGE] +[default7]: [--mean-noise-span-length MEAN_NOISE_SPAN_LENGTH] +[default6]: [--distributed-backend {nccl,gloo}] +[default6]: [--DDP-impl {local,torch}] +[default6]: [--use-contiguous-buffers-in-ddp] +[default6]: [--no-scatter-gather-tensors-in-pipeline] +[default5]: [--pipeline-model-parallel-size PIPELINE_MODEL_PARALLEL_SIZE] +[default5]: [--model-parallel-size MODEL_PARALLEL_SIZE] +[default5]: [--num-layers-per-virtual-pipeline-stage NUM_LAYERS_PER_VIRTUAL_PIPELINE_STAGE] +[default6]: [--local_rank LOCAL_RANK] +[default6]: [--lazy-mpu-init LAZY_MPU_INIT] +[default6]: [--use-cpu-initialization] [--eval-iters EVAL_ITERS] +[default6]: [--eval-interval EVAL_INTERVAL] +[default6]: [--data-path [DATA_PATH [DATA_PATH ...]]] +[default6]: [--split SPLIT] +[default6]: [--train-weighted-split-paths [TRAIN_WEIGHTED_SPLIT_PATHS [TRAIN_WEIGHTED_SPLIT_PATHS ...]]] +[default7]: [--prefixlm] [--adlr-autoresume] +[default5]: [--distributed-backend {nccl,gloo}] +[default5]: [--DDP-impl {local,torch}] +[default7]: [--adlr-autoresume-interval ADLR_AUTORESUME_INTERVAL] +[default1]: [--num-layers-per-virtual-pipeline-stage NUM_LAYERS_PER_VIRTUAL_PIPELINE_STAGE] +[default1]: [--distributed-backend {nccl,gloo}] +[default5]: [--use-contiguous-buffers-in-ddp] +[default5]: [--no-scatter-gather-tensors-in-pipeline] +[default7]: [--ict-head-size ICT_HEAD_SIZE] +[default7]: [--biencoder-projection-dim BIENCODER_PROJECTION_DIM] +[default7]: [--biencoder-shared-query-context-model] +[default7]: [--ict-load ICT_LOAD] [--bert-load BERT_LOAD] +[default1]: [--DDP-impl {local,torch}] +[default1]: [--use-contiguous-buffers-in-ddp] +[default6]: [--valid-weighted-split-paths [VALID_WEIGHTED_SPLIT_PATHS [VALID_WEIGHTED_SPLIT_PATHS ...]]] +[default6]: [--test-weighted-split-paths [TEST_WEIGHTED_SPLIT_PATHS [TEST_WEIGHTED_SPLIT_PATHS ...]]] +[default6]: [--train-weighted-split-paths-path TRAIN_WEIGHTED_SPLIT_PATHS_PATH] +[default1]: [--no-scatter-gather-tensors-in-pipeline] +[default1]: [--local_rank LOCAL_RANK] +[default1]: [--lazy-mpu-init LAZY_MPU_INIT] +[default6]: [--valid-weighted-split-paths-path VALID_WEIGHTED_SPLIT_PATHS_PATH] +[default6]: [--test-weighted-split-paths-path TEST_WEIGHTED_SPLIT_PATHS_PATH] +[default6]: [--log-path LOG_PATH] [--vocab-file VOCAB_FILE] +[default6]: [--merge-file MERGE_FILE] +[default7]: [--titles-data-path TITLES_DATA_PATH] +[default7]: [--query-in-block-prob QUERY_IN_BLOCK_PROB] +[default6]: [--vocab-extra-ids VOCAB_EXTRA_IDS] +[default6]: [--seq-length SEQ_LENGTH] +[default7]: [--use-one-sent-docs] +[default7]: [--evidence-data-path EVIDENCE_DATA_PATH] +[default7]: [--retriever-report-topk-accuracies RETRIEVER_REPORT_TOPK_ACCURACIES [RETRIEVER_REPORT_TOPK_ACCURACIES ...]] +[default1]: [--use-cpu-initialization] [--eval-iters EVAL_ITERS] +[default1]: [--eval-interval EVAL_INTERVAL] +[default1]: [--data-path [DATA_PATH [DATA_PATH ...]]] +[default1]: [--split SPLIT] +[default5]: [--local_rank LOCAL_RANK] +[default7]: [--retriever-score-scaling] +[default7]: [--block-data-path BLOCK_DATA_PATH] +[default7]: [--embedding-path EMBEDDING_PATH] +[default7]: [--indexer-batch-size INDEXER_BATCH_SIZE] +[default7]: [--indexer-log-interval INDEXER_LOG_INTERVAL] +[default7]: [--num-classes NUM_CLASSES] [--img-dim IMG_DIM] +[default7]: [--num-channels NUM_CHANNELS] [--patch-dim PATCH_DIM] +[default1]: [--train-weighted-split-paths [TRAIN_WEIGHTED_SPLIT_PATHS [TRAIN_WEIGHTED_SPLIT_PATHS ...]]] +[default1]: [--valid-weighted-split-paths [VALID_WEIGHTED_SPLIT_PATHS [VALID_WEIGHTED_SPLIT_PATHS ...]]] +[default6]: [--encoder-seq-length ENCODER_SEQ_LENGTH] +[default5]: [--lazy-mpu-init LAZY_MPU_INIT] +[default5]: [--use-cpu-initialization] [--eval-iters EVAL_ITERS] +[default5]: [--eval-interval EVAL_INTERVAL] +[default1]: [--test-weighted-split-paths [TEST_WEIGHTED_SPLIT_PATHS [TEST_WEIGHTED_SPLIT_PATHS ...]]] +[default1]: [--train-weighted-split-paths-path TRAIN_WEIGHTED_SPLIT_PATHS_PATH] +[default1]: [--valid-weighted-split-paths-path VALID_WEIGHTED_SPLIT_PATHS_PATH] +[default5]: [--data-path [DATA_PATH [DATA_PATH ...]]] +[default5]: [--split SPLIT] +[default1]: [--test-weighted-split-paths-path TEST_WEIGHTED_SPLIT_PATHS_PATH] +[default6]: [--decoder-seq-length DECODER_SEQ_LENGTH] +[default5]: [--train-weighted-split-paths [TRAIN_WEIGHTED_SPLIT_PATHS [TRAIN_WEIGHTED_SPLIT_PATHS ...]]] +[default1]: [--log-path LOG_PATH] [--vocab-file VOCAB_FILE] +[default1]: [--merge-file MERGE_FILE] +[default1]: [--vocab-extra-ids VOCAB_EXTRA_IDS] +[default6]: [--retriever-seq-length RETRIEVER_SEQ_LENGTH] +[default6]: [--sample-rate SAMPLE_RATE] [--mask-prob MASK_PROB] +[default6]: [--short-seq-prob SHORT_SEQ_PROB] [--mmap-warmup] +[default5]: [--valid-weighted-split-paths [VALID_WEIGHTED_SPLIT_PATHS [VALID_WEIGHTED_SPLIT_PATHS ...]]] +[default5]: [--test-weighted-split-paths [TEST_WEIGHTED_SPLIT_PATHS [TEST_WEIGHTED_SPLIT_PATHS ...]]] +[default6]: [--num-workers NUM_WORKERS] +[default5]: [--train-weighted-split-paths-path TRAIN_WEIGHTED_SPLIT_PATHS_PATH] +[default5]: [--valid-weighted-split-paths-path VALID_WEIGHTED_SPLIT_PATHS_PATH] +[default5]: [--test-weighted-split-paths-path TEST_WEIGHTED_SPLIT_PATHS_PATH] +[default5]: [--log-path LOG_PATH] [--vocab-file VOCAB_FILE] +[default7]: [--log-params-norm] [--log-num-zeros-in-grad] +[default6]: [--valid-num-workers VALID_NUM_WORKERS] +[default5]: [--merge-file MERGE_FILE] +[default5]: [--vocab-extra-ids VOCAB_EXTRA_IDS] +[default5]: [--seq-length SEQ_LENGTH] +[default5]: [--encoder-seq-length ENCODER_SEQ_LENGTH] +[default7]: [--tensorboard-log-interval TENSORBOARD_LOG_INTERVAL] +[default7]: [--tensorboard-queue-size TENSORBOARD_QUEUE_SIZE] +[default7]: [--log-timers-to-tensorboard] +[default5]: [--decoder-seq-length DECODER_SEQ_LENGTH] +[default1]: [--seq-length SEQ_LENGTH] +[default5]: [--retriever-seq-length RETRIEVER_SEQ_LENGTH] +[default1]: [--encoder-seq-length ENCODER_SEQ_LENGTH] +[default1]: [--decoder-seq-length DECODER_SEQ_LENGTH] +[default1]: [--retriever-seq-length RETRIEVER_SEQ_LENGTH] +[default1]: [--sample-rate SAMPLE_RATE] [--mask-prob MASK_PROB] +[default5]: [--sample-rate SAMPLE_RATE] [--mask-prob MASK_PROB] +[default5]: [--short-seq-prob SHORT_SEQ_PROB] [--mmap-warmup] +[default6]: [--tokenizer-type {BertWordPieceLowerCase,BertWordPieceCase,GPT2BPETokenizer,PretrainedFromHF}] +[default6]: [--tokenizer-name-or-path TOKENIZER_NAME_OR_PATH] +[default6]: [--data-impl {lazy,cached,mmap,infer}] +[default1]: [--short-seq-prob SHORT_SEQ_PROB] [--mmap-warmup] +[default7]: [--log-batch-size-to-tensorboard] +[default7]: [--no-log-learnig-rate-to-tensorboard] +[default7]: [--no-log-loss-scale-to-tensorboard] +[default7]: [--log-validation-ppl-to-tensorboard] +[default7]: [--zero-stage ZERO_STAGE] [--zero-reduce-scatter] +[default7]: [--zero-contigious-gradients] +[default6]: [--reset-position-ids] [--reset-attention-mask] +[default7]: [--zero-reduce-bucket-size ZERO_REDUCE_BUCKET_SIZE] +[default6]: [--eod-mask-loss] [--loss-on-targets-only] +[default6]: [--norm-target-loss] +[default7]: [--zero-allgather-bucket-size ZERO_ALLGATHER_BUCKET_SIZE] +[default6]: [--reweight-loss-based-on-position-frequency] +[default6]: [--noise-density NOISE_DENSITY] +[default6]: [--mean-noise-span-length MEAN_NOISE_SPAN_LENGTH] +[default5]: [--num-workers NUM_WORKERS] +[default7]: [--remote-device {none,cpu,nvme}] [--use-pin-memory] +[default7]: [--scattered-embeddings] [--split-transformers] +[default5]: [--valid-num-workers VALID_NUM_WORKERS] +[default5]: [--tokenizer-type {BertWordPieceLowerCase,BertWordPieceCase,GPT2BPETokenizer,PretrainedFromHF}] +[default7]: [--memory-centric-tiled-linear] +[default5]: [--tokenizer-name-or-path TOKENIZER_NAME_OR_PATH] +[default7]: [--tile-factor TILE_FACTOR] +[default5]: [--data-impl {lazy,cached,mmap,infer}] +[default6]: [--prefixlm] [--adlr-autoresume] +[default6]: [--adlr-autoresume-interval ADLR_AUTORESUME_INTERVAL] +[default6]: [--ict-head-size ICT_HEAD_SIZE] +[default6]: [--biencoder-projection-dim BIENCODER_PROJECTION_DIM] +[default7]: [--deepspeed-activation-checkpointing] +[default1]: [--num-workers NUM_WORKERS] +[default1]: [--valid-num-workers VALID_NUM_WORKERS] +[default1]: [--tokenizer-type {BertWordPieceLowerCase,BertWordPieceCase,GPT2BPETokenizer,PretrainedFromHF}] +[default6]: [--biencoder-shared-query-context-model] +[default7]: [--partition-activations] [--contigious-checkpointing] +[default7]: [--checkpoint-in-cpu] [--synchronize-each-layer] +[default7]: [--profile-backward] [--deepspeed] +[default1]: [--tokenizer-name-or-path TOKENIZER_NAME_OR_PATH] +[default5]: [--reset-position-ids] [--reset-attention-mask] +[default1]: [--data-impl {lazy,cached,mmap,infer}] +[default7]: [--deepspeed_config DEEPSPEED_CONFIG] [--deepscale] +[default7]: [--deepscale_config DEEPSCALE_CONFIG] [--deepspeed_mpi] +[default6]: [--ict-load ICT_LOAD] [--bert-load BERT_LOAD] +[default7]:finetune_t0.py: error: unrecognized arguments: --reset-progress +[default5]: [--eod-mask-loss] [--loss-on-targets-only] +[default5]: [--norm-target-loss] +[default6]: [--titles-data-path TITLES_DATA_PATH] +[default6]: [--query-in-block-prob QUERY_IN_BLOCK_PROB] +[default1]: [--reset-position-ids] [--reset-attention-mask] +[default1]: [--eod-mask-loss] [--loss-on-targets-only] +[default1]: [--norm-target-loss] +[default5]: [--reweight-loss-based-on-position-frequency] +[default1]: [--reweight-loss-based-on-position-frequency] +[default5]: [--noise-density NOISE_DENSITY] +[default5]: [--mean-noise-span-length MEAN_NOISE_SPAN_LENGTH] +[default5]: [--prefixlm] [--adlr-autoresume] +[default1]: [--noise-density NOISE_DENSITY] +[default1]: [--mean-noise-span-length MEAN_NOISE_SPAN_LENGTH] +[default1]: [--prefixlm] [--adlr-autoresume] +[default6]: [--use-one-sent-docs] +[default1]: [--adlr-autoresume-interval ADLR_AUTORESUME_INTERVAL] +[default1]: [--ict-head-size ICT_HEAD_SIZE] +[default6]: [--evidence-data-path EVIDENCE_DATA_PATH] +[default6]: [--retriever-report-topk-accuracies RETRIEVER_REPORT_TOPK_ACCURACIES [RETRIEVER_REPORT_TOPK_ACCURACIES ...]] +[default6]: [--retriever-score-scaling] +[default1]: [--biencoder-projection-dim BIENCODER_PROJECTION_DIM] +[default6]: [--block-data-path BLOCK_DATA_PATH] +[default6]: [--embedding-path EMBEDDING_PATH] +[default6]: [--indexer-batch-size INDEXER_BATCH_SIZE] +[default5]: [--adlr-autoresume-interval ADLR_AUTORESUME_INTERVAL] +[default5]: [--ict-head-size ICT_HEAD_SIZE] +[default6]: [--indexer-log-interval INDEXER_LOG_INTERVAL] +[default6]: [--num-classes NUM_CLASSES] [--img-dim IMG_DIM] +[default6]: [--num-channels NUM_CHANNELS] [--patch-dim PATCH_DIM] +[default5]: [--biencoder-projection-dim BIENCODER_PROJECTION_DIM] +[default6]: [--log-params-norm] [--log-num-zeros-in-grad] +[default6]: [--tensorboard-log-interval TENSORBOARD_LOG_INTERVAL] +[default6]: [--tensorboard-queue-size TENSORBOARD_QUEUE_SIZE] +[default1]: [--biencoder-shared-query-context-model] +[default6]: [--log-timers-to-tensorboard] +[default1]: [--ict-load ICT_LOAD] [--bert-load BERT_LOAD] +[default1]: [--titles-data-path TITLES_DATA_PATH] +[default6]: [--log-batch-size-to-tensorboard] +[default6]: [--no-log-learnig-rate-to-tensorboard] +[default6]: [--no-log-loss-scale-to-tensorboard] +[default1]: [--query-in-block-prob QUERY_IN_BLOCK_PROB] +[default1]: [--use-one-sent-docs] +[default5]: [--biencoder-shared-query-context-model] +[default1]: [--evidence-data-path EVIDENCE_DATA_PATH] +[default1]: [--retriever-report-topk-accuracies RETRIEVER_REPORT_TOPK_ACCURACIES [RETRIEVER_REPORT_TOPK_ACCURACIES ...]] +[default1]: [--retriever-score-scaling] +[default5]: [--ict-load ICT_LOAD] [--bert-load BERT_LOAD] +[default5]: [--titles-data-path TITLES_DATA_PATH] +[default1]: [--block-data-path BLOCK_DATA_PATH] +[default1]: [--embedding-path EMBEDDING_PATH] +[default1]: [--indexer-batch-size INDEXER_BATCH_SIZE] +[default5]: [--query-in-block-prob QUERY_IN_BLOCK_PROB] +[default1]: [--indexer-log-interval INDEXER_LOG_INTERVAL] +[default5]: [--use-one-sent-docs] +[default1]: [--num-classes NUM_CLASSES] [--img-dim IMG_DIM] +[default5]: [--evidence-data-path EVIDENCE_DATA_PATH] +[default6]: [--log-validation-ppl-to-tensorboard] +[default6]: [--zero-stage ZERO_STAGE] [--zero-reduce-scatter] +[default6]: [--zero-contigious-gradients] +[default5]: [--retriever-report-topk-accuracies RETRIEVER_REPORT_TOPK_ACCURACIES [RETRIEVER_REPORT_TOPK_ACCURACIES ...]] +[default5]: [--retriever-score-scaling] +[default6]: [--zero-reduce-bucket-size ZERO_REDUCE_BUCKET_SIZE] +[default6]: [--zero-allgather-bucket-size ZERO_ALLGATHER_BUCKET_SIZE] +[default1]: [--num-channels NUM_CHANNELS] [--patch-dim PATCH_DIM] +[default1]: [--log-params-norm] [--log-num-zeros-in-grad] +[default5]: [--block-data-path BLOCK_DATA_PATH] +[default5]: [--embedding-path EMBEDDING_PATH] +[default5]: [--indexer-batch-size INDEXER_BATCH_SIZE] +[default6]: [--remote-device {none,cpu,nvme}] [--use-pin-memory] +[default6]: [--scattered-embeddings] [--split-transformers] +[default5]: [--indexer-log-interval INDEXER_LOG_INTERVAL] +[default5]: [--num-classes NUM_CLASSES] [--img-dim IMG_DIM] +[default5]: [--num-channels NUM_CHANNELS] [--patch-dim PATCH_DIM] +[default1]: [--tensorboard-log-interval TENSORBOARD_LOG_INTERVAL] +[default5]: [--log-params-norm] [--log-num-zeros-in-grad] +[default1]: [--tensorboard-queue-size TENSORBOARD_QUEUE_SIZE] +[default1]: [--log-timers-to-tensorboard] +[default6]: [--memory-centric-tiled-linear] +[default6]: [--tile-factor TILE_FACTOR] +[default6]: [--deepspeed-activation-checkpointing] +[default6]: [--partition-activations] [--contigious-checkpointing] +[default1]: [--log-batch-size-to-tensorboard] +[default1]: [--no-log-learnig-rate-to-tensorboard] +[default6]: [--checkpoint-in-cpu] [--synchronize-each-layer] +[default6]: [--profile-backward] [--deepspeed] +[default6]: [--deepspeed_config DEEPSPEED_CONFIG] [--deepscale] +[default1]: [--no-log-loss-scale-to-tensorboard] +[default6]: [--deepscale_config DEEPSCALE_CONFIG] [--deepspeed_mpi] +[default1]: [--log-validation-ppl-to-tensorboard] +[default1]: [--zero-stage ZERO_STAGE] [--zero-reduce-scatter] +[default5]: [--tensorboard-log-interval TENSORBOARD_LOG_INTERVAL] +[default5]: [--tensorboard-queue-size TENSORBOARD_QUEUE_SIZE] +[default1]: [--zero-contigious-gradients] +[default1]: [--zero-reduce-bucket-size ZERO_REDUCE_BUCKET_SIZE] +[default1]: [--zero-allgather-bucket-size ZERO_ALLGATHER_BUCKET_SIZE] +[default5]: [--log-timers-to-tensorboard] +[default5]: [--log-batch-size-to-tensorboard] +[default1]: [--remote-device {none,cpu,nvme}] [--use-pin-memory] +[default1]: [--scattered-embeddings] [--split-transformers] +[default1]: [--memory-centric-tiled-linear] +[default5]: [--no-log-learnig-rate-to-tensorboard] +[default5]: [--no-log-loss-scale-to-tensorboard] +[default5]: [--log-validation-ppl-to-tensorboard] +[default6]:finetune_t0.py: error: unrecognized arguments: --reset-progress +[default5]: [--zero-stage ZERO_STAGE] [--zero-reduce-scatter] +[default5]: [--zero-contigious-gradients] +[default5]: [--zero-reduce-bucket-size ZERO_REDUCE_BUCKET_SIZE] +[default1]: [--tile-factor TILE_FACTOR] +[default1]: [--deepspeed-activation-checkpointing] +[default5]: [--zero-allgather-bucket-size ZERO_ALLGATHER_BUCKET_SIZE] +[default5]: [--remote-device {none,cpu,nvme}] [--use-pin-memory] +[default1]: [--partition-activations] [--contigious-checkpointing] +[default5]: [--scattered-embeddings] [--split-transformers] +[default1]: [--checkpoint-in-cpu] [--synchronize-each-layer] +[default5]: [--memory-centric-tiled-linear] +[default5]: [--tile-factor TILE_FACTOR] +[default1]: [--profile-backward] [--deepspeed] +[default1]: [--deepspeed_config DEEPSPEED_CONFIG] [--deepscale] +[default5]: [--deepspeed-activation-checkpointing] +[default5]: [--partition-activations] [--contigious-checkpointing] +[default5]: [--checkpoint-in-cpu] [--synchronize-each-layer] +[default1]: [--deepscale_config DEEPSCALE_CONFIG] [--deepspeed_mpi] +[default1]:finetune_t0.py: error: unrecognized arguments: --reset-progress +[default5]: [--profile-backward] [--deepspeed] +[default5]: [--deepspeed_config DEEPSPEED_CONFIG] [--deepscale] +[default5]: [--deepscale_config DEEPSCALE_CONFIG] [--deepspeed_mpi] +[default5]:finetune_t0.py: error: unrecognized arguments: --reset-progress +ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 2) local_rank: 0 (pid: 351544) of binary: /gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/bin/python +ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 2) local_rank: 0 (pid: 294200) of binary: /gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/bin/python +ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 2) local_rank: 0 (pid: 370182) of binary: /gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/bin/python +ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 2) local_rank: 0 (pid: 351586) of binary: /gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/bin/python +ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 2) local_rank: 0 (pid: 307784) of binary: /gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/bin/python +ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 2) local_rank: 0 (pid: 380473) of binary: /gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/bin/python +Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 194, in _run_module_as_main + return _run_code(code, main_globals, None, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 87, in _run_code + exec(code, run_globals) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 765, in + main() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 194, in _run_module_as_main + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 761, in main + return _run_code(code, main_globals, None, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 87, in _run_code + exec(code, run_globals) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 765, in +Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 194, in _run_module_as_main + run(args) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 752, in run +Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 194, in _run_module_as_main +Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 194, in _run_module_as_main + return _run_code(code, main_globals, None, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 87, in _run_code + return _run_code(code, main_globals, None, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 87, in _run_code + exec(code, run_globals) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 765, in + main() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return _run_code(code, main_globals, None, +Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 194, in _run_module_as_main + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 87, in _run_code + elastic_launch( + exec(code, run_globals) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 131, in __call__ + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 765, in + exec(code, run_globals) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 765, in + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 761, in main + return _run_code(code, main_globals, None, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 87, in _run_code + exec(code, run_globals) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 765, in + main() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + main() + run(args) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 752, in run + main() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 761, in main + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 761, in main + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 761, in main + elastic_launch( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 131, in __call__ + run(args) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 752, in run + return launch_agent(self._config, self._entrypoint, list(args)) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 245, in launch_agent + run(args) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 752, in run + run(args) + raise ChildFailedError( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 752, in run +torch.distributed.elastic.multiprocessing.errors.ChildFailedError: +============================================================ +/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew2/Megatron-DeepSpeed/finetune_t0.py FAILED +------------------------------------------------------------ +Failures: +[1]: + time : 2022-10-07_09:56:18 + host : jean-zay-iam39-ib0 + rank : 9 (local_rank: 1) + exitcode : 2 (pid: 351587) + error_file: + traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html +[2]: + time : 2022-10-07_09:56:18 + host : jean-zay-iam39-ib0 + rank : 10 (local_rank: 2) + exitcode : 2 (pid: 351588) + error_file: + traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html +[3]: + time : 2022-10-07_09:56:18 + host : jean-zay-iam39-ib0 + rank : 11 (local_rank: 3) + exitcode : 2 (pid: 351589) + error_file: + traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html +[4]: + time : 2022-10-07_09:56:18 + host : jean-zay-iam39-ib0 + rank : 12 (local_rank: 4) + exitcode : 2 (pid: 351590) + error_file: + traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html +[5]: + time : 2022-10-07_09:56:18 + host : jean-zay-iam39-ib0 + rank : 13 (local_rank: 5) + exitcode : 2 (pid: 351591) + error_file: + traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html +[6]: + time : 2022-10-07_09:56:18 + host : jean-zay-iam39-ib0 + rank : 14 (local_rank: 6) + exitcode : 2 (pid: 351592) + error_file: + traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html +[7]: + time : 2022-10-07_09:56:18 + host : jean-zay-iam39-ib0 + rank : 15 (local_rank: 7) + exitcode : 2 (pid: 351593) + error_file: + traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html +------------------------------------------------------------ +Root Cause (first observed failure): +[0]: + time : 2022-10-07_09:56:18 + host : jean-zay-iam39-ib0 + rank : 8 (local_rank: 0) + exitcode : 2 (pid: 351586) + error_file: + traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html +============================================================ + elastic_launch( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 131, in __call__ + return launch_agent(self._config, self._entrypoint, list(args)) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 245, in launch_agent + return launch_agent(self._config, self._entrypoint, list(args)) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 245, in launch_agent + elastic_launch( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 131, in __call__ + raise ChildFailedError( +Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 194, in _run_module_as_main +torch.distributed.elastic.multiprocessing.errors.ChildFailedError: +============================================================ +/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew2/Megatron-DeepSpeed/finetune_t0.py FAILED +------------------------------------------------------------ +Failures: +[1]: + time : 2022-10-07_09:56:18 + host : jean-zay-iam40-ib0 + rank : 17 (local_rank: 1) + exitcode : 2 (pid: 351545) + error_file: + traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html +[2]: + time : 2022-10-07_09:56:18 + host : jean-zay-iam40-ib0 + rank : 18 (local_rank: 2) + exitcode : 2 (pid: 351546) + error_file: + traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html +[3]: + time : 2022-10-07_09:56:18 + host : jean-zay-iam40-ib0 + rank : 19 (local_rank: 3) + exitcode : 2 (pid: 351547) + error_file: + elastic_launch( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 131, in __call__ + raise ChildFailedError( + traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html +[4]: + time : 2022-10-07_09:56:18 + host : jean-zay-iam40-ib0 + rank : 20 (local_rank: 4) + exitcode : 2 (pid: 351548) + error_file: + traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html +[5]: + time : 2022-10-07_09:56:18 + host : jean-zay-iam40-ib0 + rank : 21 (local_rank: 5) + exitcode : 2 (pid: 351549) + error_file: + traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html +[6]: + time : 2022-10-07_09:56:18 + host : jean-zay-iam40-ib0 + rank : 22 (local_rank: 6) + exitcode : 2 (pid: 351550) + error_file: + traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html +[7]: + time : 2022-10-07_09:56:18 + host : jean-zay-iam40-ib0 + rank : 23 (local_rank: 7) + exitcode : 2 (pid: 351551) + error_file: + return launch_agent(self._config, self._entrypoint, list(args)) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 245, in launch_agent + traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html +------------------------------------------------------------ +Root Cause (first observed failure): +[0]: + time : 2022-10-07_09:56:18 + host : jean-zay-iam40-ib0 + rank : 16 (local_rank: 0) + exitcode : 2 (pid: 351544) + error_file: + traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html +============================================================ +torch.distributed.elastic.multiprocessing.errors.ChildFailedError: +============================================================ +/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew2/Megatron-DeepSpeed/finetune_t0.py FAILED +------------------------------------------------------------ +Failures: +[1]: + time : 2022-10-07_09:56:18 + host : jean-zay-iam42-ib0 + rank : 33 (local_rank: 1) + exitcode : 2 (pid: 294201) + error_file: + traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html +[2]: + time : 2022-10-07_09:56:18 + host : jean-zay-iam42-ib0 + rank : 34 (local_rank: 2) + exitcode : 2 (pid: 294202) + error_file: + traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html +[3]: + time : 2022-10-07_09:56:18 + host : jean-zay-iam42-ib0 + rank : 35 (local_rank: 3) + exitcode : 2 (pid: 294203) + error_file: + traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html +[4]: + time : 2022-10-07_09:56:18 + host : jean-zay-iam42-ib0 + rank : 36 (local_rank: 4) + exitcode : 2 (pid: 294204) + error_file: + traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html +[5]: + time : 2022-10-07_09:56:18 + host : jean-zay-iam42-ib0 + rank : 37 (local_rank: 5) + exitcode : 2 (pid: 294205) + error_file: + traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html +[6]: + time : 2022-10-07_09:56:18 + host : jean-zay-iam42-ib0 + rank : 38 (local_rank: 6) + exitcode : 2 (pid: 294206) + error_file: + traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html +[7]: + time : 2022-10-07_09:56:18 + host : jean-zay-iam42-ib0 + rank : 39 (local_rank: 7) + exitcode : 2 (pid: 294207) + error_file: + traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html +------------------------------------------------------------ +Root Cause (first observed failure): +[0]: + time : 2022-10-07_09:56:18 + host : jean-zay-iam42-ib0 + rank : 32 (local_rank: 0) + exitcode : 2 (pid: 294200) + error_file: + traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html + return launch_agent(self._config, self._entrypoint, list(args)) +============================================================ + raise ChildFailedError( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 245, in launch_agent + return _run_code(code, main_globals, None, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 87, in _run_code +torch.distributed.elastic.multiprocessing.errors.ChildFailedError: +============================================================ +/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew2/Megatron-DeepSpeed/finetune_t0.py FAILED +------------------------------------------------------------ +Failures: +[1]: + time : 2022-10-07_09:56:18 + host : jean-zay-iam43-ib0 + rank : 41 (local_rank: 1) + exitcode : 2 (pid: 307785) + error_file: + traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html +[2]: + time : 2022-10-07_09:56:18 + host : jean-zay-iam43-ib0 + rank : 42 (local_rank: 2) + exitcode : 2 (pid: 307786) + error_file: + traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html +[3]: + time : 2022-10-07_09:56:18 + host : jean-zay-iam43-ib0 + rank : 43 (local_rank: 3) + exitcode : 2 (pid: 307787) + error_file: + traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html +[4]: + time : 2022-10-07_09:56:18 + host : jean-zay-iam43-ib0 + rank : 44 (local_rank: 4) + exitcode : 2 (pid: 307788) + error_file: + traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html +[5]: + time : 2022-10-07_09:56:18 + host : jean-zay-iam43-ib0 + rank : 45 (local_rank: 5) + exitcode : 2 (pid: 307789) + error_file: + traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html +[6]: + time : 2022-10-07_09:56:18 + host : jean-zay-iam43-ib0 + rank : 46 (local_rank: 6) + exitcode : 2 (pid: 307790) + error_file: + traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html +[7]: + time : 2022-10-07_09:56:18 + host : jean-zay-iam43-ib0 + rank : 47 (local_rank: 7) + exitcode : 2 (pid: 307791) + error_file: + traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html +------------------------------------------------------------ +Root Cause (first observed failure): +[0]: + time : 2022-10-07_09:56:18 + host : jean-zay-iam43-ib0 + rank : 40 (local_rank: 0) + exitcode : 2 (pid: 307784) + error_file: + traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html +============================================================ + raise ChildFailedError( + exec(code, run_globals) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 765, in +torch.distributed.elastic.multiprocessing.errors.ChildFailedError: +============================================================ +/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew2/Megatron-DeepSpeed/finetune_t0.py FAILED +------------------------------------------------------------ +Failures: +[1]: + time : 2022-10-07_09:56:18 + host : jean-zay-iam41-ib0 + rank : 25 (local_rank: 1) + exitcode : 2 (pid: 370183) + error_file: + traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html +[2]: + time : 2022-10-07_09:56:18 + host : jean-zay-iam41-ib0 + rank : 26 (local_rank: 2) + exitcode : 2 (pid: 370184) + error_file: + traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html +[3]: + time : 2022-10-07_09:56:18 + host : jean-zay-iam41-ib0 + rank : 27 (local_rank: 3) + exitcode : 2 (pid: 370185) + error_file: + traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html +[4]: + time : 2022-10-07_09:56:18 + host : jean-zay-iam41-ib0 + rank : 28 (local_rank: 4) + exitcode : 2 (pid: 370186) + error_file: + traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html +[5]: + time : 2022-10-07_09:56:18 + host : jean-zay-iam41-ib0 + rank : 29 (local_rank: 5) + exitcode : 2 (pid: 370187) + error_file: + traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html +[6]: + time : 2022-10-07_09:56:18 + host : jean-zay-iam41-ib0 + rank : 30 (local_rank: 6) + exitcode : 2 (pid: 370188) + error_file: + traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html +[7]: + time : 2022-10-07_09:56:18 + host : jean-zay-iam41-ib0 + rank : 31 (local_rank: 7) + exitcode : 2 (pid: 370189) + error_file: + traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html +------------------------------------------------------------ +Root Cause (first observed failure): +[0]: + time : 2022-10-07_09:56:18 + host : jean-zay-iam41-ib0 + rank : 24 (local_rank: 0) + exitcode : 2 (pid: 370182) + error_file: + traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html +============================================================ + main() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 194, in _run_module_as_main + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 761, in main + return _run_code(code, main_globals, None, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 87, in _run_code + exec(code, run_globals) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 765, in + run(args) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 752, in run + elastic_launch( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 131, in __call__ + return launch_agent(self._config, self._entrypoint, list(args)) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 245, in launch_agent + raise ChildFailedError( +torch.distributed.elastic.multiprocessing.errors.ChildFailedError: +============================================================ +/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew2/Megatron-DeepSpeed/finetune_t0.py FAILED +------------------------------------------------------------ +Failures: +[1]: + time : 2022-10-07_09:56:12 + host : jean-zay-iam48-ib0 + rank : 57 (local_rank: 1) + exitcode : 1 (pid: 178809) + error_file: /tmp/torchelastic_2n3x7po0/none_ub7wt4iz/attempt_0/1/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew2/Megatron-DeepSpeed/finetune_t0.py", line 180, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew2/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew2/Megatron-DeepSpeed/megatron/initialize.py", line 83, in initialize_megatron + assert torch.cuda.is_available(), 'Megatron requires CUDA.' + AssertionError: Megatron requires CUDA. + +[2]: + time : 2022-10-07_09:56:12 + host : jean-zay-iam48-ib0 + rank : 58 (local_rank: 2) + exitcode : 1 (pid: 178810) + error_file: /tmp/torchelastic_2n3x7po0/none_ub7wt4iz/attempt_0/2/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew2/Megatron-DeepSpeed/finetune_t0.py", line 180, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew2/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew2/Megatron-DeepSpeed/megatron/initialize.py", line 83, in initialize_megatron + assert torch.cuda.is_available(), 'Megatron requires CUDA.' + AssertionError: Megatron requires CUDA. + +[3]: + time : 2022-10-07_09:56:12 + host : jean-zay-iam48-ib0 + rank : 59 (local_rank: 3) + exitcode : 1 (pid: 178811) + error_file: /tmp/torchelastic_2n3x7po0/none_ub7wt4iz/attempt_0/3/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew2/Megatron-DeepSpeed/finetune_t0.py", line 180, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew2/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew2/Megatron-DeepSpeed/megatron/initialize.py", line 83, in initialize_megatron + assert torch.cuda.is_available(), 'Megatron requires CUDA.' + AssertionError: Megatron requires CUDA. + +[4]: + time : 2022-10-07_09:56:12 + host : jean-zay-iam48-ib0 + rank : 60 (local_rank: 4) + exitcode : 1 (pid: 178812) + error_file: /tmp/torchelastic_2n3x7po0/none_ub7wt4iz/attempt_0/4/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew2/Megatron-DeepSpeed/finetune_t0.py", line 180, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew2/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew2/Megatron-DeepSpeed/megatron/initialize.py", line 83, in initialize_megatron + assert torch.cuda.is_available(), 'Megatron requires CUDA.' + AssertionError: Megatron requires CUDA. + +[5]: + time : 2022-10-07_09:56:12 + host : jean-zay-iam48-ib0 + rank : 61 (local_rank: 5) + exitcode : 1 (pid: 178813) + error_file: /tmp/torchelastic_2n3x7po0/none_ub7wt4iz/attempt_0/5/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew2/Megatron-DeepSpeed/finetune_t0.py", line 180, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew2/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew2/Megatron-DeepSpeed/megatron/initialize.py", line 83, in initialize_megatron + assert torch.cuda.is_available(), 'Megatron requires CUDA.' + AssertionError: Megatron requires CUDA. + +[6]: + time : 2022-10-07_09:56:12 + host : jean-zay-iam48-ib0 + rank : 62 (local_rank: 6) + exitcode : 1 (pid: 178814) + error_file: /tmp/torchelastic_2n3x7po0/none_ub7wt4iz/attempt_0/6/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew2/Megatron-DeepSpeed/finetune_t0.py", line 180, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew2/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew2/Megatron-DeepSpeed/megatron/initialize.py", line 83, in initialize_megatron + assert torch.cuda.is_available(), 'Megatron requires CUDA.' + AssertionError: Megatron requires CUDA. + +[7]: + time : 2022-10-07_09:56:12 + host : jean-zay-iam48-ib0 + rank : 63 (local_rank: 7) + exitcode : 1 (pid: 178815) + error_file: /tmp/torchelastic_2n3x7po0/none_ub7wt4iz/attempt_0/7/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew2/Megatron-DeepSpeed/finetune_t0.py", line 180, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew2/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew2/Megatron-DeepSpeed/megatron/initialize.py", line 83, in initialize_megatron + assert torch.cuda.is_available(), 'Megatron requires CUDA.' + AssertionError: Megatron requires CUDA. + +------------------------------------------------------------ +Root Cause (first observed failure): +[0]: + time : 2022-10-07_09:56:12 + host : jean-zay-iam48-ib0 + rank : 56 (local_rank: 0) + exitcode : 1 (pid: 178808) + error_file: /tmp/torchelastic_2n3x7po0/none_ub7wt4iz/attempt_0/0/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew2/Megatron-DeepSpeed/finetune_t0.py", line 180, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew2/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew2/Megatron-DeepSpeed/megatron/initialize.py", line 83, in initialize_megatron + assert torch.cuda.is_available(), 'Megatron requires CUDA.' + AssertionError: Megatron requires CUDA. + +============================================================ + main() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 761, in main + run(args) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 752, in run + main() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 761, in main + elastic_launch( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 131, in __call__ + return launch_agent(self._config, self._entrypoint, list(args)) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 245, in launch_agent + raise ChildFailedError( +torch.distributed.elastic.multiprocessing.errors.ChildFailedError: +============================================================ +/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew2/Megatron-DeepSpeed/finetune_t0.py FAILED +------------------------------------------------------------ +Failures: +[1]: + time : 2022-10-07_09:56:18 + host : jean-zay-iam38-ib0 + rank : 1 (local_rank: 1) + exitcode : 2 (pid: 380474) + error_file: + traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html +[2]: + time : 2022-10-07_09:56:18 + host : jean-zay-iam38-ib0 + rank : 2 (local_rank: 2) + exitcode : 2 (pid: 380475) + error_file: + traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html +[3]: + time : 2022-10-07_09:56:18 + host : jean-zay-iam38-ib0 + rank : 3 (local_rank: 3) + exitcode : 2 (pid: 380476) + error_file: + traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html +[4]: + time : 2022-10-07_09:56:18 + host : jean-zay-iam38-ib0 + rank : 4 (local_rank: 4) + exitcode : 2 (pid: 380478) + error_file: + traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html +[5]: + time : 2022-10-07_09:56:18 + host : jean-zay-iam38-ib0 + rank : 5 (local_rank: 5) + exitcode : 2 (pid: 380479) + error_file: + traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html +[6]: + time : 2022-10-07_09:56:18 + host : jean-zay-iam38-ib0 + rank : 6 (local_rank: 6) + exitcode : 2 (pid: 380480) + error_file: + traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html +[7]: + time : 2022-10-07_09:56:18 + host : jean-zay-iam38-ib0 + rank : 7 (local_rank: 7) + exitcode : 2 (pid: 380481) + error_file: + traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html +------------------------------------------------------------ +Root Cause (first observed failure): +[0]: + time : 2022-10-07_09:56:18 + host : jean-zay-iam38-ib0 + rank : 0 (local_rank: 0) + exitcode : 2 (pid: 380473) + error_file: + traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html +============================================================ + run(args) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 752, in run + elastic_launch( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 131, in __call__ + return launch_agent(self._config, self._entrypoint, list(args)) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 245, in launch_agent + raise ChildFailedError( +torch.distributed.elastic.multiprocessing.errors.ChildFailedError: +============================================================ +/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew2/Megatron-DeepSpeed/finetune_t0.py FAILED +------------------------------------------------------------ +Failures: +[1]: + time : 2022-10-07_09:56:12 + host : jean-zay-iam47-ib0 + rank : 49 (local_rank: 1) + exitcode : 1 (pid: 185268) + error_file: /tmp/torchelastic_x6bmkgh2/none_5wd15iu6/attempt_0/1/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew2/Megatron-DeepSpeed/finetune_t0.py", line 180, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew2/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew2/Megatron-DeepSpeed/megatron/initialize.py", line 83, in initialize_megatron + assert torch.cuda.is_available(), 'Megatron requires CUDA.' + AssertionError: Megatron requires CUDA. + +[2]: + time : 2022-10-07_09:56:12 + host : jean-zay-iam47-ib0 + rank : 50 (local_rank: 2) + exitcode : 1 (pid: 185269) + error_file: /tmp/torchelastic_x6bmkgh2/none_5wd15iu6/attempt_0/2/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew2/Megatron-DeepSpeed/finetune_t0.py", line 180, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew2/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew2/Megatron-DeepSpeed/megatron/initialize.py", line 83, in initialize_megatron + assert torch.cuda.is_available(), 'Megatron requires CUDA.' + AssertionError: Megatron requires CUDA. + +[3]: + time : 2022-10-07_09:56:12 + host : jean-zay-iam47-ib0 + rank : 51 (local_rank: 3) + exitcode : 1 (pid: 185270) + error_file: /tmp/torchelastic_x6bmkgh2/none_5wd15iu6/attempt_0/3/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew2/Megatron-DeepSpeed/finetune_t0.py", line 180, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew2/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew2/Megatron-DeepSpeed/megatron/initialize.py", line 83, in initialize_megatron + assert torch.cuda.is_available(), 'Megatron requires CUDA.' + AssertionError: Megatron requires CUDA. + +[4]: + time : 2022-10-07_09:56:12 + host : jean-zay-iam47-ib0 + rank : 52 (local_rank: 4) + exitcode : 1 (pid: 185271) + error_file: /tmp/torchelastic_x6bmkgh2/none_5wd15iu6/attempt_0/4/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew2/Megatron-DeepSpeed/finetune_t0.py", line 180, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew2/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew2/Megatron-DeepSpeed/megatron/initialize.py", line 83, in initialize_megatron + assert torch.cuda.is_available(), 'Megatron requires CUDA.' + AssertionError: Megatron requires CUDA. + +[5]: + time : 2022-10-07_09:56:12 + host : jean-zay-iam47-ib0 + rank : 53 (local_rank: 5) + exitcode : 1 (pid: 185272) + error_file: /tmp/torchelastic_x6bmkgh2/none_5wd15iu6/attempt_0/5/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew2/Megatron-DeepSpeed/finetune_t0.py", line 180, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew2/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew2/Megatron-DeepSpeed/megatron/initialize.py", line 83, in initialize_megatron + assert torch.cuda.is_available(), 'Megatron requires CUDA.' + AssertionError: Megatron requires CUDA. + +[6]: + time : 2022-10-07_09:56:12 + host : jean-zay-iam47-ib0 + rank : 54 (local_rank: 6) + exitcode : 1 (pid: 185273) + error_file: /tmp/torchelastic_x6bmkgh2/none_5wd15iu6/attempt_0/6/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew2/Megatron-DeepSpeed/finetune_t0.py", line 180, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew2/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew2/Megatron-DeepSpeed/megatron/initialize.py", line 83, in initialize_megatron + assert torch.cuda.is_available(), 'Megatron requires CUDA.' + AssertionError: Megatron requires CUDA. + +[7]: + time : 2022-10-07_09:56:12 + host : jean-zay-iam47-ib0 + rank : 55 (local_rank: 7) + exitcode : 1 (pid: 185274) + error_file: /tmp/torchelastic_x6bmkgh2/none_5wd15iu6/attempt_0/7/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew2/Megatron-DeepSpeed/finetune_t0.py", line 180, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew2/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew2/Megatron-DeepSpeed/megatron/initialize.py", line 83, in initialize_megatron + assert torch.cuda.is_available(), 'Megatron requires CUDA.' + AssertionError: Megatron requires CUDA. + +------------------------------------------------------------ +Root Cause (first observed failure): +[0]: + time : 2022-10-07_09:56:12 + host : jean-zay-iam47-ib0 + rank : 48 (local_rank: 0) + exitcode : 1 (pid: 185267) + error_file: /tmp/torchelastic_x6bmkgh2/none_5wd15iu6/attempt_0/0/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew2/Megatron-DeepSpeed/finetune_t0.py", line 180, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew2/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew2/Megatron-DeepSpeed/megatron/initialize.py", line 83, in initialize_megatron + assert torch.cuda.is_available(), 'Megatron requires CUDA.' + AssertionError: Megatron requires CUDA. + +============================================================ +srun: error: jean-zay-iam47: task 6: Exited with exit code 1 +srun: launch/slurm: _step_signal: Terminating StepId=2095548.0 +slurmstepd: error: *** STEP 2095548.0 ON jean-zay-iam38 CANCELLED AT 2022-10-07T09:56:18 *** +srun: error: jean-zay-iam48: task 7: Exited with exit code 1 +srun: error: jean-zay-iam40: task 2: Exited with exit code 1 +srun: error: jean-zay-iam42: task 4: Exited with exit code 1 +srun: error: jean-zay-iam39: task 1: Exited with exit code 1 +srun: error: jean-zay-iam43: task 5: Exited with exit code 1 +srun: error: jean-zay-iam41: task 3: Exited with exit code 1 +srun: error: jean-zay-iam38: task 0: Exited with exit code 1 +WARNING:__main__: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +WARNING:__main__: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +WARNING:__main__: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +WARNING:__main__: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +WARNING:__main__: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +WARNING:__main__: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +WARNING:__main__: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +WARNING:__main__: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +WARNING:__main__: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +WARNING:__main__: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +WARNING:__main__: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +WARNING:__main__: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +WARNING:__main__: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +WARNING:__main__: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +WARNING:__main__: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +WARNING:__main__: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +[default0]:using world size: 64, data-parallel-size: 16, tensor-model-parallel size: 2, pipeline-model-parallel size: 2 +[default0]:using torch.float16 for parameters ... +[default0]:------------------------ arguments ------------------------ +[default0]: abort_on_unmet_fused_kernel_constraints ......... True +[default0]: accumulate_allreduce_grads_in_fp32 .............. False +[default0]: adam_beta1 ...................................... 0.9 +[default0]: adam_beta2 ...................................... 0.95 +[default0]: adam_eps ........................................ 1e-08 +[default0]: adlr_autoresume ................................. False +[default0]: adlr_autoresume_interval ........................ 1000 +[default0]: apply_query_key_layer_scaling ................... True +[default0]: apply_residual_connection_post_layernorm ........ False +[default0]: attention_dropout ............................... 0.1 +[default0]: attention_softmax_in_fp32 ....................... False +[default0]: bert_binary_head ................................ True +[default0]: bert_load ....................................... None +[default0]: bf16 ............................................ False +[default0]: bias_dropout_fusion ............................. True +[default0]: bias_gelu_fusion ................................ True +[default0]: biencoder_projection_dim ........................ 0 +[default0]: biencoder_shared_query_context_model ............ False +[default0]: block_data_path ................................. None +[default0]: checkpoint_activations .......................... True +[default0]: checkpoint_in_cpu ............................... False +[default0]: checkpoint_num_layers ........................... 1 +[default0]: clip_grad ....................................... 1.0 +[default0]: codecarbon_dir .................................. None +[default0]: consumed_train_samples .......................... 0 +[default0]: consumed_train_tokens ........................... 0 +[default0]: consumed_valid_samples .......................... 0 +[default0]: contigious_checkpointing ........................ False +[default0]:Offline mode: forcing local_files_only=True +[default0]: cpu_optimizer ................................... False +[default0]: cpu_torch_adam .................................. False +[default0]: curriculum_learning ............................. False +[default0]:Offline mode: forcing local_files_only=True +[default0]:loading file tokenizer.json from cache at /gpfswork/rech/six/commun/models/models--bigscience--tokenizer/snapshots/d56c744c331ce0ffa516ed084785eb22da74b91e/tokenizer.json +[default0]:loading file added_tokens.json from cache at None +[default0]: data_impl ....................................... mmap +[default0]: data_parallel_size .............................. 16 +[default0]: data_path ....................................... None +[default0]: dataloader_type ................................. single +[default0]: DDP_impl ........................................ local +[default0]: decoder_seq_length .............................. None +[default0]: deepscale ....................................... False +[default0]: deepscale_config ................................ None +[default0]:loading file special_tokens_map.json from cache at /gpfswork/rech/six/commun/models/models--bigscience--tokenizer/snapshots/d56c744c331ce0ffa516ed084785eb22da74b91e/special_tokens_map.json +[default0]:loading file tokenizer_config.json from cache at /gpfswork/rech/six/commun/models/models--bigscience--tokenizer/snapshots/d56c744c331ce0ffa516ed084785eb22da74b91e/tokenizer_config.json +[default0]: deepspeed ....................................... True +[default0]: deepspeed_activation_checkpointing .............. True +[default0]: deepspeed_config ................................ ./ds_config.2095810.json +[default0]: deepspeed_mpi ................................... False +[default0]: distribute_checkpointed_activations ............. False +[default0]: distributed_backend ............................. nccl +[default0]: embed_layernorm ................................. True +[default0]: embedding_path .................................. None +[default0]: encoder_seq_length .............................. 2048 +[default0]: eod_mask_loss ................................... False +[default0]: eval_interval ................................... 125 +[default0]: eval_iters ...................................... 10 +[default0]: eval_only ....................................... None +[default0]: evidence_data_path .............................. None +[default0]: exit_duration_in_mins ........................... 5990 +[default0]: exit_interval ................................... None +[default0]: ffn_hidden_size ................................. 8192 +[default0]: finetune ........................................ False +[default0]: fp16 ............................................ True +[default0]: fp16_lm_cross_entropy ........................... False +[default0]: fp32_residual_connection ........................ False +[default0]: gigaflos_no_embeds .............................. 0 +[default0]: global_batch_size ............................... 2048 +[default0]: glu_activation .................................. None +[default0]: hidden_dropout .................................. 0.1 +[default0]: hidden_size ..................................... 2048 +[default0]: hysteresis ...................................... 2 +[default0]: ict_head_size ................................... None +[default0]: ict_load ........................................ None +[default0]: img_dim ......................................... 224 +[default0]: indexer_batch_size .............................. 128 +[default0]: indexer_log_interval ............................ 1000 +[default0]: inference ....................................... False +[default0]: init_method_std ................................. 0.0048 +[default0]: init_method_xavier_uniform ...................... False +[default0]: initial_loss_scale .............................. 4294967296 +[default0]: kill_switch_path ................................ /gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/kill-switch-tr13b-1B3-mtf +[default0]: kv_channels ..................................... 128 +[default0]: layernorm_epsilon ............................... 1e-05 +[default0]: lazy_mpu_init ................................... None +[default0]: load ............................................ /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq +[default0]: local_rank ...................................... None +[default0]: log_batch_size_to_tensorboard ................... True +[default0]: log_interval .................................... 1 +[default0]: log_learning_rate_to_tensorboard ................ True +[default0]: log_level ....................................... None +[default0]: log_level_replica ............................... None +[default0]: log_loss_scale_to_tensorboard ................... True +[default0]: log_num_zeros_in_grad ........................... False +[default0]: log_params_norm ................................. False +[default0]: log_path ........................................ None +[default0]: log_timers_to_tensorboard ....................... True +[default0]: log_validation_ppl_to_tensorboard ............... True +[default0]: loss_on_targets_only ............................ False +[default0]: loss_scale ...................................... None +[default0]: loss_scale_window ............................... 1000 +[default0]: lr .............................................. 2e-05 +[default0]: lr_decay_iters .................................. None +[default0]: lr_decay_samples ................................ None +[default0]: lr_decay_style .................................. constant +[default0]: lr_decay_tokens ................................. None +[default0]: lr_warmup_fraction .............................. None +[default0]: lr_warmup_iters ................................. 0 +[default0]: lr_warmup_samples ............................... 0 +[default0]: make_vocab_size_divisible_by .................... 128 +[default0]: mask_prob ....................................... 0.15 +[default0]: masked_softmax_fusion ........................... True +[default0]: max_position_embeddings ......................... 2048 +[default0]: mean_noise_span_length .......................... None +[default0]: memory_centric_tiled_linear ..................... False +[default0]: merge_file ...................................... None +[default0]: micro_batch_size ................................ 1 +[default0]: min_loss_scale .................................. 1.0 +[default0]: min_lr .......................................... 0.0 +[default0]: mmap_warmup ..................................... False +[default0]: no_load_optim ................................... True +[default0]: no_load_rng ..................................... None +[default0]: no_save_optim ................................... None +[default0]: no_save_rng ..................................... None +[default0]: noise_density ................................... None +[default0]: norm_target_loss ................................ True +[default0]: num_attention_heads ............................. 16 +[default0]: num_channels .................................... 3 +[default0]: num_classes ..................................... 1000 +[default0]: num_layers ...................................... 24 +[default0]: num_layers_per_virtual_pipeline_stage ........... None +[default0]: num_workers ..................................... 2 +[default0]: onnx_safe ....................................... None +[default0]: openai_gelu ..................................... False +[default0]: optimizer ....................................... adam +[default0]: override_lr_scheduler ........................... False +[default0]: pad_vocab_size_to ............................... 250880 +[default0]: params_dtype .................................... torch.float16 +[default0]: partition_activations ........................... False +[default0]: patch_dim ....................................... 16 +[default0]: pipeline_model_parallel_size .................... 2 +[default0]: position_embedding_type ......................... PositionEmbeddingType.alibi +[default0]: pp_partition_method ............................. type:transformer|embedding +[default0]: prefixlm ........................................ False +[default0]: profile_backward ................................ False +[default0]: query_in_block_prob ............................. 0.1 +[default0]: rampup_batch_size ............................... None +[default0]: rank ............................................ 0 +[default0]: remote_device ................................... none +[default0]: reset_attention_mask ............................ False +[default0]: reset_position_ids .............................. False +[default0]: reset_progress .................................. True +[default0]: retriever_report_topk_accuracies ................ [] +[default0]: retriever_score_scaling ......................... False +[default0]: retriever_seq_length ............................ 256 +[default0]: reweight_loss_based_on_position_frequency ....... False +[default0]: sample_rate ..................................... 1.0 +[default0]: save ............................................ /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq +[default0]: save_interval ................................... 2 +[default0]: scatter_gather_tensors_in_pipeline .............. True +[default0]: scattered_embeddings ............................ False +[default0]: seed ............................................ 42 +[default0]: seq_length ...................................... 2048 +[default0]: sgd_momentum .................................... 0.9 +[default0]: short_seq_prob .................................. 0.1 +[default0]: skip_train_iteration_range ...................... None +[default0]: split ........................................... None +[default0]: split_transformers .............................. False +[default0]: sync_tp_duplicated_parameters ................... False +[default0]: synchronize_each_layer .......................... False +[default0]: tensor_model_parallel_size ...................... 2 +[default0]: tensorboard_dir ................................. /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/tr13b-1B3-ml-logs/tensorboard/xp3capmixnewcodelonglossseq +[default0]: tensorboard_log_interval ........................ 1 +[default0]: tensorboard_queue_size .......................... 5 +[default0]: test_weighted_split_paths ....................... None +[default0]: test_weighted_split_paths_path .................. None +[default0]: tile_factor ..................................... 1 +[default0]: titles_data_path ................................ None +[default0]: tokenizer_name_or_path .......................... bigscience/tokenizer +[default0]: tokenizer_type .................................. PretrainedFromHF +[default0]: train_iters ..................................... None +[default0]: train_samples ................................... 6348800 +[default0]: train_tokens .................................... None +[default0]: train_weighted_split_names ...................... ['train'] +[default0]: train_weighted_split_paths ...................... [['/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_en', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_es', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_pt', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_code', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_fr', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ar', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_id', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_zh', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_hi', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_vi', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ur', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_te', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ta', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_bn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_mr', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_sw', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_gu', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_pa', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ne', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_yo', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ig', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ny', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_zu', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_xh', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_sn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ts', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_rw', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_lg', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_tn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_nso', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_rn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ml', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_kn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_or', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_as', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ln', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_wo', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_tum', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ki', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_st', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_fon', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ca', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_eu', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ak', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_bm', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_tw']] +[default0]: train_weighted_split_paths_path ................. None +[default0]: train_weighted_split_splits ..................... [['0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950']] +[default0]: train_weighted_split_weights .................... [['0.3924620202', '0.0797519865', '0.0645613968', '0.0592991658', '0.0584218969', '0.0492644683', '0.0485168956', '0.048344327', '0.0462777165', '0.0326663657', '0.0202046859', '0.0140392334', '0.0097374884', '0.0087708344', '0.0070173416', '0.0059077793', '0.0059055884', '0.0055112422', '0.0041465344', '0.0037157869', '0.0034255885', '0.0028662571', '0.0027776135', '0.0026823974', '0.0026594781', '0.0026358847', '0.0026264474', '0.0024850557', '0.0024808426', '0.0024194999', '0.0020327371', '0.0018436532', '0.0017427072', '0.0017007448', '0.0016458059', '0.0013303289', '0.0012908943', '0.0012221364', '0.001211688', '0.0012015765', '0.0011909595', '0.0011650068', '0.001138717', '0.0011385485', '0.0011275944', '0.0011195053']] +[default0]: universal_checkpoint ............................ False +[default0]: use_bnb_optimizer ............................... False +[default0]: use_checkpoint_lr_scheduler ..................... False +[default0]: use_contiguous_buffers_in_ddp ................... False +[default0]: use_cpu_initialization .......................... None +[default0]: use_one_sent_docs ............................... False +[default0]: use_pin_memory .................................. False +[default0]: valid_num_workers ............................... 2 +[default0]: valid_weighted_split_names ...................... ['validation_pretraining', 'validation'] +[default0]: valid_weighted_split_paths ...................... [['/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/ar/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_ar_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/ca/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_ca_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/code/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_code_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/en/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_en_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/es/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_es_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/eu/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_eu_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/fr/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_fr_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/id/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_id_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-as/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-as_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-bn/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-bn_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-gu/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-gu_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-hi/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-hi_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-kn/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-kn_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-ml/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-ml_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-mr/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-mr_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-ne/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-ne_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-or/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-or_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-pa/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-pa_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-ta/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-ta_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-te/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-te_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-ur/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-ur_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/nigercongo-all/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_nigercongo-all_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/oscar-en/meg_ds_bigscience_tokenizer_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/oscar-zh/meg_ds_bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/pt/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_pt_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/vi/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_vi_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/zhs/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_zhs_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/zht/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_zht_text_document'], ['/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_en', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_es', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_pt', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_code', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_fr', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ar', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_id', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_zh', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_hi', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_vi', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ur', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_te', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ta', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_bn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_mr', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_sw', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_gu', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_pa', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ne', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_yo', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ig', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ny', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_zu', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_xh', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_sn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ts', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_rw', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_lg', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_tn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_nso', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_rn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ml', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_kn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_or', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_as', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ln', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_wo', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_tum', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ki', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_st', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_fon', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ca', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_eu', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ak', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_bm', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_tw']] +[default0]: valid_weighted_split_paths_path ................. None +[default0]: valid_weighted_split_splits ..................... [['0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0'], ['0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1']] +[default0]: valid_weighted_split_weights .................... [['0.0330676168743166', '0.011242051312222764', '0.13027200903379185', '0.22171164529099704', '0.10667815627928671', '0.0015595123898173287', '0.13054018439603915', '0.01091803753667153', '0.00011021422347108609', '0.005492381453597748', '0.0004021215011318779', '0.007470068593492175', '0.0006190467776576425', '0.0010335296343329384', '0.0005012010684646179', '0.0006672772956128299', '0.00035928138344705506', '0.0005084433130291778', '0.0021137328219915496', '0.0009129946225980253', '0.0012454301613725426', '0.00031588689199263235', '0.08137213783015229', '0.055293935695898196', '0.04954150576361177', '0.02461641286531197', '0.12091748245519074', '0.0005177025345001541'], ['0.3924620202', '0.0797519865', '0.0645613968', '0.0592991658', '0.0584218969', '0.0492644683', '0.0485168956', '0.048344327', '0.0462777165', '0.0326663657', '0.0202046859', '0.0140392334', '0.0097374884', '0.0087708344', '0.0070173416', '0.0059077793', '0.0059055884', '0.0055112422', '0.0041465344', '0.0037157869', '0.0034255885', '0.0028662571', '0.0027776135', '0.0026823974', '0.0026594781', '0.0026358847', '0.0026264474', '0.0024850557', '0.0024808426', '0.0024194999', '0.0020327371', '0.0018436532', '0.0017427072', '0.0017007448', '0.0016458059', '0.0013303289', '0.0012908943', '0.0012221364', '0.001211688', '0.0012015765', '0.0011909595', '0.0011650068', '0.001138717', '0.0011385485', '0.0011275944', '0.0011195053']] +[default0]: virtual_pipeline_model_parallel_size ............ None +[default0]: vocab_extra_ids ................................. 0 +[default0]: vocab_file ...................................... None +[default0]: weight_decay .................................... 0.0001 +[default0]: world_size ...................................... 64 +[default0]: zero_allgather_bucket_size ...................... 0.0 +[default0]: zero_contigious_gradients ....................... False +[default0]: zero_reduce_bucket_size ......................... 0.0 +[default0]: zero_reduce_scatter ............................. False +[default0]: zero_stage ...................................... 1 +[default0]:-------------------- end of arguments --------------------- +[default0]:setting number of micro-batches to constant 128 +[default0]:> building PretrainedFromHF tokenizer ... +[default0]: vocab file is un-used. loading tokenizer from pre-trained model +[default0]: > padded vocab (size: 250680) with 200 dummy tokens (new size: 250880) +[default0]:DeepSpeed general environment info: +[default0]:torch install path ............... ['/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch'] +[default0]:torch version .................... 1.12.0 +[default0]:torch cuda version ............... 11.3 +[default0]:torch hip version ................ None +[default0]:nvcc version ..................... 11.4 +[default0]:deepspeed install path ........... ['/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed'] +[default0]:deepspeed info ................... 0.7.1+8b2a6371, 8b2a6371, master +[default0]:deepspeed wheel compiled w. ...... torch 1.12, cuda 11.3 +[default0]:**** Git info for Megatron: git_hash=6c1018f git_branch=mtf-multival **** +[default0]:> initializing torch distributed ... +[default0]:[2022-10-07 12:25:20,763] [INFO] [comm.py:628:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl +[default3]:> setting tensorboard ... +[default0]:> initializing tensor model parallel with size 2 +[default0]:> initializing pipeline model parallel with size 2 +[default0]:> setting random seeds to 42 ... +[default0]:[2022-10-07 12:25:22,937] [INFO] [checkpointing.py:226:model_parallel_cuda_manual_seed] > initializing model parallel cuda seeds on global rank 0, model parallel rank 0, and data parallel rank 0 with model parallel seed: 2760 and data parallel seed: 42 +[default0]:> compiling dataset index builder ... +[default0]:make: Entering directory '/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/data' +[default0]:make: Nothing to be done for 'default'. +[default0]:make: Leaving directory '/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/data' +[default0]:>>> done with dataset index builder. Compilation time: 0.085 seconds +[default0]:> compiling and loading fused kernels ... +[default0]:Detected CUDA files, patching ldflags +[default0]:Emitting ninja build file /gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/fused_kernels/build/build.ninja... +[default0]:Building extension module scaled_upper_triang_masked_softmax_cuda... +[default0]:Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) +[default0]:ninja: no work to do. +[default0]:Loading extension module scaled_upper_triang_masked_softmax_cuda... +[default0]:Detected CUDA files, patching ldflags +[default0]:Emitting ninja build file /gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/fused_kernels/build/build.ninja... +[default0]:Building extension module scaled_masked_softmax_cuda... +[default0]:Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) +[default0]:ninja: no work to do. +[default0]:Loading extension module scaled_masked_softmax_cuda... +[default0]:Detected CUDA files, patching ldflags +[default0]:Emitting ninja build file /gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/fused_kernels/build/build.ninja... +[default0]:Building extension module fused_mix_prec_layer_norm_cuda... +[default0]:Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) +[default0]:ninja: no work to do. +[default0]:Loading extension module fused_mix_prec_layer_norm_cuda... +[default0]:>>> done with compiling and loading fused kernels. Compilation time: 5.122 seconds +[default0]:time to initialize megatron (seconds): 24.146 +[default0]:[after megatron is initialized] datetime: 2022-10-07 12:25:28 +[default0]:building GPT model ... +[default0]:[2022-10-07 12:25:28,211] [INFO] [utils.py:827:see_memory_usage] Before Building Model +[default0]:[2022-10-07 12:25:28,212] [INFO] [utils.py:828:see_memory_usage] MA 0.0 GB Max_MA 0.0 GB CA 0.0 GB Max_CA 0 GB +[default0]:[2022-10-07 12:25:28,212] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 32.37 GB, percent = 17.3% +[default0]:SEED_LAYERS=False BASE_SEED=1234 SEED_FN=None +[default0]:Using topology: {ProcessCoord(pipe=0, data=0, model=0): 0, ProcessCoord(pipe=0, data=0, model=1): 1, ProcessCoord(pipe=0, data=1, model=0): 2, ProcessCoord(pipe=0, data=1, model=1): 3, ProcessCoord(pipe=0, data=2, model=0): 4, ProcessCoord(pipe=0, data=2, model=1): 5, ProcessCoord(pipe=0, data=3, model=0): 6, ProcessCoord(pipe=0, data=3, model=1): 7, ProcessCoord(pipe=0, data=4, model=0): 8, ProcessCoord(pipe=0, data=4, model=1): 9, ProcessCoord(pipe=0, data=5, model=0): 10, ProcessCoord(pipe=0, data=5, model=1): 11, ProcessCoord(pipe=0, data=6, model=0): 12, ProcessCoord(pipe=0, data=6, model=1): 13, ProcessCoord(pipe=0, data=7, model=0): 14, ProcessCoord(pipe=0, data=7, model=1): 15, ProcessCoord(pipe=0, data=8, model=0): 16, ProcessCoord(pipe=0, data=8, model=1): 17, ProcessCoord(pipe=0, data=9, model=0): 18, ProcessCoord(pipe=0, data=9, model=1): 19, ProcessCoord(pipe=0, data=10, model=0): 20, ProcessCoord(pipe=0, data=10, model=1): 21, ProcessCoord(pipe=0, data=11, model=0): 22, ProcessCoord(pipe=0, data=11, model=1): 23, ProcessCoord(pipe=0, data=12, model=0): 24, ProcessCoord(pipe=0, data=12, model=1): 25, ProcessCoord(pipe=0, data=13, model=0): 26, ProcessCoord(pipe=0, data=13, model=1): 27, ProcessCoord(pipe=0, data=14, model=0): 28, ProcessCoord(pipe=0, data=14, model=1): 29, ProcessCoord(pipe=0, data=15, model=0): 30, ProcessCoord(pipe=0, data=15, model=1): 31, ProcessCoord(pipe=1, data=0, model=0): 32, ProcessCoord(pipe=1, data=0, model=1): 33, ProcessCoord(pipe=1, data=1, model=0): 34, ProcessCoord(pipe=1, data=1, model=1): 35, ProcessCoord(pipe=1, data=2, model=0): 36, ProcessCoord(pipe=1, data=2, model=1): 37, ProcessCoord(pipe=1, data=3, model=0): 38, ProcessCoord(pipe=1, data=3, model=1): 39, ProcessCoord(pipe=1, data=4, model=0): 40, ProcessCoord(pipe=1, data=4, model=1): 41, ProcessCoord(pipe=1, data=5, model=0): 42, ProcessCoord(pipe=1, data=5, model=1): 43, ProcessCoord(pipe=1, data=6, model=0): 44, ProcessCoord(pipe=1, data=6, model=1): 45, ProcessCoord(pipe=1, data=7, model=0): 46, ProcessCoord(pipe=1, data=7, model=1): 47, ProcessCoord(pipe=1, data=8, model=0): 48, ProcessCoord(pipe=1, data=8, model=1): 49, ProcessCoord(pipe=1, data=9, model=0): 50, ProcessCoord(pipe=1, data=9, model=1): 51, ProcessCoord(pipe=1, data=10, model=0): 52, ProcessCoord(pipe=1, data=10, model=1): 53, ProcessCoord(pipe=1, data=11, model=0): 54, ProcessCoord(pipe=1, data=11, model=1): 55, ProcessCoord(pipe=1, data=12, model=0): 56, ProcessCoord(pipe=1, data=12, model=1): 57, ProcessCoord(pipe=1, data=13, model=0): 58, ProcessCoord(pipe=1, data=13, model=1): 59, ProcessCoord(pipe=1, data=14, model=0): 60, ProcessCoord(pipe=1, data=14, model=1): 61, ProcessCoord(pipe=1, data=15, model=0): 62, ProcessCoord(pipe=1, data=15, model=1): 63} +[default0]:[2022-10-07 12:25:29,128] [INFO] [module.py:366:_partition_layers] Partitioning pipeline stages with method type:transformer|embedding +[default0]:stage=0 layers=15 +[default0]: 0: _to_float16 +[default0]: 1: EmbeddingPipe +[default0]: 2: +[default0]: 3: ParallelTransformerLayerPipe +[default0]: 4: ParallelTransformerLayerPipe +[default0]: 5: ParallelTransformerLayerPipe +[default0]: 6: ParallelTransformerLayerPipe +[default0]: 7: ParallelTransformerLayerPipe +[default0]: 8: ParallelTransformerLayerPipe +[default0]: 9: ParallelTransformerLayerPipe +[default0]: 10: ParallelTransformerLayerPipe +[default0]: 11: ParallelTransformerLayerPipe +[default0]: 12: ParallelTransformerLayerPipe +[default0]: 13: ParallelTransformerLayerPipe +[default0]: 14: ParallelTransformerLayerPipe +[default0]:stage=1 layers=16 +[default0]: 15: ParallelTransformerLayerPipe +[default0]: 16: ParallelTransformerLayerPipe +[default0]: 17: ParallelTransformerLayerPipe +[default0]: 18: ParallelTransformerLayerPipe +[default0]: 19: ParallelTransformerLayerPipe +[default0]: 20: ParallelTransformerLayerPipe +[default0]: 21: ParallelTransformerLayerPipe +[default0]: 22: ParallelTransformerLayerPipe +[default0]: 23: ParallelTransformerLayerPipe +[default0]: 24: ParallelTransformerLayerPipe +[default0]: 25: ParallelTransformerLayerPipe +[default0]: 26: ParallelTransformerLayerPipe +[default0]: 27: undo +[default0]: 28: MixedFusedLayerNorm +[default0]: 29: EmbeddingPipe +[default0]: 30: float16_to_fp32 +[default0]: loss: CrossEntropy +[default0]:[2022-10-07 12:25:29,721] [INFO] [utils.py:827:see_memory_usage] After Building Model +[default0]:[2022-10-07 12:25:29,722] [INFO] [utils.py:828:see_memory_usage] MA 1.04 GB Max_MA 1.04 GB CA 1.04 GB Max_CA 1 GB +[default0]:[2022-10-07 12:25:29,722] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 32.57 GB, percent = 17.4% +[default0]:setting training iterations to 3100 +[default0]:> learning rate decay style: constant +[default0]:DeepSpeed is enabled. +[default0]:[2022-10-07 12:25:29,724] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed info: version=0.7.1+8b2a6371, git-hash=8b2a6371, git-branch=master +[default1]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default3]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default3]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default1]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default3]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default1]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default1]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default3]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default3]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default1]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default3]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default1]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default1]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default3]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default1]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default3]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default1]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default2]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default0]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default2]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default0]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default2]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default0]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default2]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default1]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default0]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default2]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default0]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default0]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default3]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default2]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default1]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default3]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default0]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default2]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default1]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default3]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default1]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default0]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default3]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default0]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default0]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default2]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default1]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default2]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default3]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default2]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default0]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default2]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default0]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default2]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default1]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default2]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default0]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default3]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default3]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default2]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default3]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default2]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default1]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default0]:[2022-10-07 12:25:30,204] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed Flops Profiler Enabled: False +[default0]:[2022-10-07 12:25:30,205] [INFO] [logging.py:68:log_dist] [Rank 0] Removing param_group that has no 'params' in the client Optimizer +[default0]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default0]:[2022-10-07 12:25:30,205] [INFO] [logging.py:68:log_dist] [Rank 0] Using client Optimizer as basic optimizer +[default0]:[2022-10-07 12:25:30,209] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed Basic Optimizer = {basic_optimizer.__class__.__name__} +[default0]:[2022-10-07 12:25:30,209] [INFO] [utils.py:52:is_zero_supported_optimizer] Checking ZeRO support for optimizer=FusedAdam type= +[default0]:[2022-10-07 12:25:30,209] [INFO] [logging.py:68:log_dist] [Rank 0] Creating fp16 ZeRO stage 1 optimizer +[default0]:[2022-10-07 12:25:30,209] [INFO] [stage_1_and_2.py:134:__init__] Reduce bucket size 500000000 +[default0]:[2022-10-07 12:25:30,209] [INFO] [stage_1_and_2.py:135:__init__] Allgather bucket size 500000000 +[default0]:[2022-10-07 12:25:30,209] [INFO] [stage_1_and_2.py:136:__init__] CPU Offload: False +[default0]:[2022-10-07 12:25:30,209] [INFO] [stage_1_and_2.py:137:__init__] Round robin gradient partitioning: False +[default0]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default2]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default0]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default3]:Emitting ninja build file /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113/utils/build.ninja... +[default3]:Building extension module utils... +[default3]:Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) +[default3]:ninja: no work to do. +[default3]:Loading extension module utils... +[default3]:Time to load utils op: 0.6819522380828857 seconds +[default1]:Loading extension module utils... +[default1]:Loading extension module utils... +[default1]:Time to load utils op: 0.7020831108093262 seconds +[default0]:Loading extension module utils... +[default0]:Time to load utils op: 0.6535861492156982 seconds +[default3]:Loading extension module utils... +[default3]:Time to load utils op: 0.7022998332977295 seconds +[default0]:Loading extension module utils... +[default0]:Time to load utils op: 0.639254093170166 seconds +[default3]:Loading extension module utils... +[default3]:Time to load utils op: 0.6896708011627197 seconds +[default1]:Loading extension module utils... +[default1]:Time to load utils op: 0.6896495819091797 seconds +[default3]:Loading extension module utils... +[default3]:Time to load utils op: 0.6902999877929688 seconds +[default1]:Loading extension module utils... +[default1]:Time to load utils op: 0.6901099681854248 seconds +[default2]:Loading extension module utils... +[default2]:Time to load utils op: 0.6369357109069824 seconds +[default0]:Loading extension module utils... +[default0]:Time to load utils op: 0.6372239589691162 seconds +[default2]:Loading extension module utils... +[default2]:Time to load utils op: 0.6918933391571045 seconds +[default0]:Loading extension module utils... +[default0]:Time to load utils op: 0.6765129566192627 seconds +[default3]:Loading extension module utils... +[default3]:Time to load utils op: 0.7417089939117432 seconds +[default1]:Loading extension module utils... +[default1]:Time to load utils op: 0.741549015045166 seconds +[default2]:Loading extension module utils... +[default2]:Time to load utils op: 0.6732971668243408 seconds +[default1]:Time to load utils op: 0.7096776962280273 seconds +[default0]:Loading extension module utils... +[default0]:Time to load utils op: 0.6769702434539795 seconds +[default1]:Loading extension module utils... +[default1]:Time to load utils op: 0.7254374027252197 seconds +[default2]:Loading extension module utils... +[default2]:Time to load utils op: 0.672412633895874 seconds +[default3]:Loading extension module utils... +[default3]:Time to load utils op: 0.7254095077514648 seconds +[default2]:Loading extension module utils... +[default2]:Time to load utils op: 0.6874029636383057 seconds +[default2]:Loading extension module utils... +[default2]:Time to load utils op: 0.6759865283966064 seconds +[default0]:Loading extension module utils... +[default0]:Time to load utils op: 0.6764419078826904 seconds +[default1]:Loading extension module utils... +[default1]:Time to load utils op: 0.7151391506195068 seconds +[default0]:Loading extension module utils... +[default0]:Time to load utils op: 0.6872885227203369 seconds +[default0]:Loading extension module utils... +[default2]:Loading extension module utils... +[default2]:Time to load utils op: 0.6856553554534912 seconds +[default0]:Time to load utils op: 0.6851463317871094 seconds +[default3]:Loading extension module utils... +[default3]:Time to load utils op: 0.7153899669647217 seconds +[default1]:Loading extension module utils... +[default1]:Time to load utils op: 0.7176198959350586 seconds +[default3]:Loading extension module utils... +[default3]:Time to load utils op: 0.71807861328125 seconds +[default0]:Loading extension module utils... +[default0]:Time to load utils op: 0.6859776973724365 seconds +[default1]:Loading extension module utils... +[default1]:Time to load utils op: 0.7152070999145508 seconds +[default1]:Loading extension module utils... +[default1]:Time to load utils op: 0.701610803604126 seconds +[default2]:Loading extension module utils... +[default2]:Time to load utils op: 0.6859257221221924 seconds +[default3]:Loading extension module utils... +[default3]:Time to load utils op: 0.715078592300415 seconds +[default0]:Loading extension module utils... +[default0]:Time to load utils op: 0.6727883815765381 seconds +[default1]:Loading extension module utils... +[default1]:Time to load utils op: 0.7011668682098389 seconds +[default2]:Loading extension module utils... +[default2]:Time to load utils op: 0.6729810237884521 seconds +[default3]:Loading extension module utils... +[default3]:Time to load utils op: 0.701286792755127 seconds +[default2]:Loading extension module utils... +[default2]:Time to load utils op: 0.6814613342285156 seconds +[default3]:Loading extension module utils... +[default3]:Time to load utils op: 0.7017176151275635 seconds +[default0]:Loading extension module utils... +[default0]:Time to load utils op: 0.6813151836395264 seconds +[default2]:Loading extension module utils... +[default2]:Time to load utils op: 0.673353910446167 seconds +[default1]:Loading extension module utils... +[default1]:Time to load utils op: 0.7263422012329102 seconds +[default3]:Loading extension module utils... +[default3]:Time to load utils op: 0.7263240814208984 seconds +[default0]:Loading extension module utils... +[default0]:Time to load utils op: 0.6777496337890625 seconds +[default2]:Loading extension module utils... +[default2]:Time to load utils op: 0.6778242588043213 seconds +[default0]:Loading extension module utils... +[default0]:Time to load utils op: 0.6849002838134766 seconds +[default2]:Loading extension module utils... +[default2]:Time to load utils op: 0.6850175857543945 seconds +[default1]:Loading extension module utils... +[default1]:Time to load utils op: 0.7349951267242432 seconds +[default3]:Loading extension module utils... +[default3]:Time to load utils op: 0.7350075244903564 seconds +[default2]:Loading extension module utils... +[default2]:Time to load utils op: 0.6844174861907959 seconds +[default1]:Loading extension module utils... +[default1]:Time to load utils op: 0.7411162853240967 seconds +[default0]:Loading extension module utils... +[default0]:Time to load utils op: 0.7111670970916748 seconds +[default3]:Loading extension module utils... +[default3]:Time to load utils op: 0.7407383918762207 seconds +[default3]:Loading extension module utils... +[default3]:Time to load utils op: 0.7113933563232422 seconds +[default1]:Loading extension module utils... +[default1]:Time to load utils op: 0.7177605628967285 seconds +[default3]:Loading extension module utils... +[default3]:Time to load utils op: 0.7274458408355713 seconds +[default1]:Loading extension module utils... +[default1]:Time to load utils op: 0.7274761199951172 seconds +[default2]:Loading extension module utils... +[default2]:Time to load utils op: 0.6992175579071045 seconds +[default0]:Loading extension module utils... +[default0]:Time to load utils op: 0.6876537799835205 seconds +[default2]:Loading extension module utils... +[default2]:Time to load utils op: 0.6826488971710205 seconds +[default0]:Loading extension module utils... +[default0]:Time to load utils op: 0.699270486831665 seconds +[default1]:Rank: 33 partition count [16, 16, 16] and sizes[(24969216, False), (9961472, False), (14848, False)] +[default1]:Rank: 61 partition count [16, 16, 16] and sizes[(24969216, False), (9961472, False), (14848, False)] +[default3]:Rank: 39 partition count [16, 16, 16] and sizes[(24969216, False), (9961472, False), (14848, False)] +[default3]:Rank: 51 partition count [16, 16, 16] and sizes[(24969216, False), (9961472, False), (14848, False)] +[default3]:Rank: 43 partition count [16, 16, 16] and sizes[(24969216, False), (9961472, False), (14848, False)] +[default3]:Rank: 35 partition count [16, 16, 16] and sizes[(24969216, False), (9961472, False), (14848, False)] +[default1]:Rank: 53 partition count [16, 16, 16] and sizes[(24969216, False), (9961472, False), (14848, False)] +[default3]:Rank: 55 partition count [16, 16, 16] and sizes[(24969216, False), (9961472, False), (14848, False)] +[default0]:Rank: 60 partition count [16, 16, 16] and sizes[(24969216, False), (9961472, False), (14848, False)] +[default0]:Rank: 52 partition count [16, 16, 16] and sizes[(24969216, False), (9961472, False), (14848, False)] +[default2]:Rank: 6 partition count [16, 16, 16] and sizes[(24969216, False), (9961472, False), (14848, False)] +[default2]:Rank: 62 partition count [16, 16, 16] and sizes[(24969216, False), (9961472, False), (14848, False)] +[default3]:Rank: 63 partition count [16, 16, 16] and sizes[(24969216, False), (9961472, False), (14848, False)] +[default0]:Rank: 12 partition count [16, 16, 16] and sizes[(24969216, False), (9961472, False), (14848, False)] +[default2]:Rank: 14 partition count [16, 16, 16] and sizes[(24969216, False), (9961472, False), (14848, False)] +[default3]:Rank: 7 partition count [16, 16, 16] and sizes[(24969216, False), (9961472, False), (14848, False)] +[default2]:Rank: 18 partition count [16, 16, 16] and sizes[(24969216, False), (9961472, False), (14848, False)] +[default1]:Rank: 17 partition count [16, 16, 16] and sizes[(24969216, False), (9961472, False), (14848, False)] +[default1]:Rank: 21 partition count [16, 16, 16] and sizes[(24969216, False), (9961472, False), (14848, False)] +[default0]:Rank: 28 partition count [16, 16, 16] and sizes[(24969216, False), (9961472, False), (14848, False)] +[default3]:Rank: 31 partition count [16, 16, 16] and sizes[(24969216, False), (9961472, False), (14848, False)] +[default0]:Rank: 36 partition count [16, 16, 16] and sizes[(24969216, False), (9961472, False), (14848, False)] +[default2]:Rank: 30 partition count [16, 16, 16] and sizes[(24969216, False), (9961472, False), (14848, False)] +[default0]:Rank: 20 partition count [16, 16, 16] and sizes[(24969216, False), (9961472, False), (14848, False)] +[default2]:Rank: 38 partition count [16, 16, 16] and sizes[(24969216, False), (9961472, False), (14848, False)] +[default1]:Rank: 37 partition count [16, 16, 16] and sizes[(24969216, False), (9961472, False), (14848, False)] +[default3]:Rank: 59 partition count [16, 16, 16] and sizes[(24969216, False), (9961472, False), (14848, False)] +[default2]:Rank: 58 partition count [16, 16, 16] and sizes[(24969216, False), (9961472, False), (14848, False)] +[default2]:Rank: 46 partition count [16, 16, 16] and sizes[(24969216, False), (9961472, False), (14848, False)] +[default3]:Rank: 47 partition count [16, 16, 16] and sizes[(24969216, False), (9961472, False), (14848, False)] +[default0]:Rank: 44 partition count [16, 16, 16] and sizes[(24969216, False), (9961472, False), (14848, False)] +[default1]:Rank: 9 partition count [16, 16, 16] and sizes[(24969216, False), (9961472, False), (14848, False)] +[default2]:Rank: 10 partition count [16, 16, 16] and sizes[(24969216, False), (9961472, False), (14848, False)] +[default0]:Rank: 8 partition count [16, 16, 16] and sizes[(24969216, False), (9961472, False), (14848, False)] +[default3]:Rank: 3 partition count [16, 16, 16] and sizes[(24969216, False), (9961472, False), (14848, False)] +[default1]:Rank: 1 partition count [16, 16, 16] and sizes[(24969216, False), (9961472, False), (14848, False)] +[default3]:Rank: 27 partition count [16, 16, 16] and sizes[(24969216, False), (9961472, False), (14848, False)] +[default1]:Rank: 25 partition count [16, 16, 16] and sizes[(24969216, False), (9961472, False), (14848, False)] +[default2]:Rank: 26 partition count [16, 16, 16] and sizes[(24969216, False), (9961472, False), (14848, False)] +[default0]:Rank: 0 partition count [16, 16, 16] and sizes[(24969216, False), (9961472, False), (14848, False)] +[default1]:Rank: 49 partition count [16, 16, 16] and sizes[(24969216, False), (9961472, False), (14848, False)] +[default2]:Rank: 50 partition count [16, 16, 16] and sizes[(24969216, False), (9961472, False), (14848, False)] +[default0]:Rank: 48 partition count [16, 16, 16] and sizes[(24969216, False), (9961472, False), (14848, False)] +[default2]:Rank: 42 partition count [16, 16, 16] and sizes[(24969216, False), (9961472, False), (14848, False)] +[default2]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default2]:No modifications detected for re-loaded extension module utils, skipping build step... +[default2]:Loading extension module utils... +[default2]:Time to load utils op: 0.0014507770538330078 seconds +[default0]:Rank: 40 partition count [16, 16, 16] and sizes[(24969216, False), (9961472, False), (14848, False)] +[default0]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default0]:No modifications detected for re-loaded extension module utils, skipping build step... +[default0]:Loading extension module utils... +[default0]:Time to load utils op: 0.0008859634399414062 seconds +[default3]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default3]:No modifications detected for re-loaded extension module utils, skipping build step... +[default3]:Loading extension module utils... +[default3]:Time to load utils op: 0.0008702278137207031 seconds +[default1]:Rank: 41 partition count [16, 16, 16] and sizes[(24969216, False), (9961472, False), (14848, False)] +[default1]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default1]:No modifications detected for re-loaded extension module utils, skipping build step... +[default1]:Loading extension module utils... +[default1]:Time to load utils op: 0.0008182525634765625 seconds +[default2]:Rank: 34 partition count [16, 16, 16] and sizes[(24969216, False), (9961472, False), (14848, False)] +[default2]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default2]:No modifications detected for re-loaded extension module utils, skipping build step... +[default2]:Loading extension module utils... +[default2]:Time to load utils op: 0.0011754035949707031 seconds +[default0]:Rank: 32 partition count [16, 16, 16] and sizes[(24969216, False), (9961472, False), (14848, False)] +[default0]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default0]:No modifications detected for re-loaded extension module utils, skipping build step... +[default0]:Loading extension module utils... +[default0]:Time to load utils op: 0.0014204978942871094 seconds +[default3]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default1]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default3]:No modifications detected for re-loaded extension module utils, skipping build step... +[default3]:Loading extension module utils... +[default3]:Time to load utils op: 0.0008511543273925781 seconds +[default1]:No modifications detected for re-loaded extension module utils, skipping build step... +[default1]:Loading extension module utils... +[default1]:Time to load utils op: 0.0011761188507080078 seconds +[default1]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default1]:No modifications detected for re-loaded extension module utils, skipping build step... +[default1]:Loading extension module utils... +[default1]:Time to load utils op: 0.0019919872283935547 seconds +[default3]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default3]:No modifications detected for re-loaded extension module utils, skipping build step... +[default3]:Loading extension module utils... +[default3]:Time to load utils op: 0.0015490055084228516 seconds +[default2]:Rank: 54 partition count [16, 16, 16] and sizes[(24969216, False), (9961472, False), (14848, False)] +[default2]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default2]:No modifications detected for re-loaded extension module utils, skipping build step... +[default2]:Loading extension module utils... +[default2]:Time to load utils op: 0.0021829605102539062 seconds +[default0]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default0]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default0]:No modifications detected for re-loaded extension module utils, skipping build step... +[default0]:Loading extension module utils... +[default0]:Time to load utils op: 0.0019228458404541016 seconds +[default0]:No modifications detected for re-loaded extension module utils, skipping build step... +[default0]:Loading extension module utils... +[default2]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default2]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default2]:No modifications detected for re-loaded extension module utils, skipping build step... +[default2]:Loading extension module utils... +[default2]:Time to load utils op: 0.0010197162628173828 seconds +[default2]:No modifications detected for re-loaded extension module utils, skipping build step... +[default0]:Time to load utils op: 0.0019028186798095703 seconds +[default2]:Loading extension module utils... +[default2]:Time to load utils op: 0.001287698745727539 seconds +[default3]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default1]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default1]:No modifications detected for re-loaded extension module utils, skipping build step... +[default1]:Loading extension module utils... +[default3]:No modifications detected for re-loaded extension module utils, skipping build step... +[default3]:Loading extension module utils... +[default1]:Time to load utils op: 0.0015954971313476562 seconds +[default3]:Time to load utils op: 0.0014064311981201172 seconds +[default0]:Rank: 4 partition count [16, 16, 16] and sizes[(24969216, False), (9961472, False), (14848, False)] +[default0]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default0]:No modifications detected for re-loaded extension module utils, skipping build step... +[default0]:Loading extension module utils... +[default0]:Time to load utils op: 0.0012290477752685547 seconds +[default1]:Rank: 13 partition count [16, 16, 16] and sizes[(24969216, False), (9961472, False), (14848, False)] +[default1]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default1]:No modifications detected for re-loaded extension module utils, skipping build step... +[default3]:Rank: 15 partition count [16, 16, 16] and sizes[(24969216, False), (9961472, False), (14848, False)] +[default1]:Loading extension module utils... +[default1]:Time to load utils op: 0.0009992122650146484 seconds +[default3]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default2]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default2]:No modifications detected for re-loaded extension module utils, skipping build step... +[default2]:Loading extension module utils... +[default2]:Time to load utils op: 0.0011305809020996094 seconds +[default0]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default3]:No modifications detected for re-loaded extension module utils, skipping build step... +[default3]:Loading extension module utils... +[default0]:No modifications detected for re-loaded extension module utils, skipping build step... +[default0]:Loading extension module utils... +[default3]:Time to load utils op: 0.001300811767578125 seconds +[default0]:Time to load utils op: 0.0009534358978271484 seconds +[default1]:Rank: 5 partition count [16, 16, 16] and sizes[(24969216, False), (9961472, False), (14848, False)] +[default3]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default3]:No modifications detected for re-loaded extension module utils, skipping build step... +[default3]:Loading extension module utils... +[default1]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default1]:No modifications detected for re-loaded extension module utils, skipping build step... +[default0]:Rank: 16 partition count [16, 16, 16] and sizes[(24969216, False), (9961472, False), (14848, False)] +[default0]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default0]:No modifications detected for re-loaded extension module utils, skipping build step... +[default2]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default0]:Loading extension module utils... +[default0]:Time to load utils op: 0.0009467601776123047 seconds +[default3]:Time to load utils op: 0.000989675521850586 seconds +[default1]:Loading extension module utils... +[default1]:Time to load utils op: 0.001276254653930664 seconds +[default2]:No modifications detected for re-loaded extension module utils, skipping build step... +[default2]:Loading extension module utils... +[default2]:Time to load utils op: 0.001154184341430664 seconds +[default1]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default1]:No modifications detected for re-loaded extension module utils, skipping build step... +[default1]:Loading extension module utils... +[default1]:Time to load utils op: 0.000946044921875 seconds +[default3]:Rank: 19 partition count [16, 16, 16] and sizes[(24969216, False), (9961472, False), (14848, False)] +[default3]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default3]:No modifications detected for re-loaded extension module utils, skipping build step... +[default3]:Loading extension module utils... +[default3]:Time to load utils op: 0.001268148422241211 seconds +[default1]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default1]:No modifications detected for re-loaded extension module utils, skipping build step... +[default1]:Loading extension module utils... +[default1]:Time to load utils op: 0.0007715225219726562 seconds +[default1]:Rank: 29 partition count [16, 16, 16] and sizes[(24969216, False), (9961472, False), (14848, False)] +[default1]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default1]:No modifications detected for re-loaded extension module utils, skipping build step... +[default1]:Loading extension module utils... +[default1]:Time to load utils op: 0.0009579658508300781 seconds +[default0]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default0]:No modifications detected for re-loaded extension module utils, skipping build step... +[default0]:Loading extension module utils... +[default0]:Time to load utils op: 0.0006361007690429688 seconds +[default2]:Rank: 22 partition count [16, 16, 16] and sizes[(24969216, False), (9961472, False), (14848, False)] +[default2]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default2]:No modifications detected for re-loaded extension module utils, skipping build step... +[default3]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default3]:No modifications detected for re-loaded extension module utils, skipping build step... +[default3]:Loading extension module utils... +[default3]:Time to load utils op: 0.0005161762237548828 seconds +[default2]:Loading extension module utils... +[default2]:Time to load utils op: 0.0010151863098144531 seconds +[default3]:Rank: 23 partition count [16, 16, 16] and sizes[(24969216, False), (9961472, False), (14848, False)] +[default3]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default3]:No modifications detected for re-loaded extension module utils, skipping build step... +[default0]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default0]:No modifications detected for re-loaded extension module utils, skipping build step... +[default0]:Loading extension module utils... +[default0]:Time to load utils op: 0.001173257827758789 seconds +[default3]:Loading extension module utils... +[default3]:Time to load utils op: 0.0006639957427978516 seconds +[default2]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default2]:No modifications detected for re-loaded extension module utils, skipping build step... +[default2]:Loading extension module utils... +[default2]:Time to load utils op: 0.0005273818969726562 seconds +[default0]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default0]:No modifications detected for re-loaded extension module utils, skipping build step... +[default0]:Loading extension module utils... +[default0]:Time to load utils op: 0.0007927417755126953 seconds +[default3]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default3]:No modifications detected for re-loaded extension module utils, skipping build step... +[default3]:Loading extension module utils... +[default3]:Time to load utils op: 0.0009920597076416016 seconds +[default2]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default2]:No modifications detected for re-loaded extension module utils, skipping build step... +[default2]:Loading extension module utils... +[default1]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default2]:Time to load utils op: 0.0009908676147460938 seconds +[default1]:No modifications detected for re-loaded extension module utils, skipping build step... +[default1]:Loading extension module utils... +[default1]:Time to load utils op: 0.0010275840759277344 seconds +[default1]:Rank: 57 partition count [16, 16, 16] and sizes[(24969216, False), (9961472, False), (14848, False)] +[default1]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default1]:No modifications detected for re-loaded extension module utils, skipping build step... +[default1]:Loading extension module utils... +[default1]:Time to load utils op: 0.0009312629699707031 seconds +[default3]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default3]:No modifications detected for re-loaded extension module utils, skipping build step... +[default3]:Loading extension module utils... +[default0]:Rank: 56 partition count [16, 16, 16] and sizes[(24969216, False), (9961472, False), (14848, False)] +[default3]:Time to load utils op: 0.0012278556823730469 seconds +[default0]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default0]:No modifications detected for re-loaded extension module utils, skipping build step... +[default0]:Loading extension module utils... +[default0]:Time to load utils op: 0.0010249614715576172 seconds +[default2]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default2]:No modifications detected for re-loaded extension module utils, skipping build step... +[default2]:Loading extension module utils... +[default2]:Time to load utils op: 0.001977682113647461 seconds +[default2]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default2]:No modifications detected for re-loaded extension module utils, skipping build step... +[default2]:Loading extension module utils... +[default2]:Time to load utils op: 0.0008873939514160156 seconds +[default3]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default3]:No modifications detected for re-loaded extension module utils, skipping build step... +[default3]:Loading extension module utils... +[default3]:Time to load utils op: 0.0010485649108886719 seconds +[default1]:Rank: 45 partition count [16, 16, 16] and sizes[(24969216, False), (9961472, False), (14848, False)] +[default0]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default0]:No modifications detected for re-loaded extension module utils, skipping build step... +[default0]:Loading extension module utils... +[default0]:Time to load utils op: 0.0008275508880615234 seconds +[default1]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default1]:No modifications detected for re-loaded extension module utils, skipping build step... +[default1]:Loading extension module utils... +[default1]:Time to load utils op: 0.0008463859558105469 seconds +[default1]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default1]:No modifications detected for re-loaded extension module utils, skipping build step... +[default1]:Loading extension module utils... +[default1]:Time to load utils op: 0.0010538101196289062 seconds +[default2]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default3]:Rank: 11 partition count [16, 16, 16] and sizes[(24969216, False), (9961472, False), (14848, False)] +[default3]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default3]:No modifications detected for re-loaded extension module utils, skipping build step... +[default3]:Loading extension module utils... +[default2]:No modifications detected for re-loaded extension module utils, skipping build step... +[default2]:Loading extension module utils... +[default3]:Time to load utils op: 0.0012288093566894531 seconds +[default0]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default2]:Time to load utils op: 0.0013375282287597656 seconds +[default0]:No modifications detected for re-loaded extension module utils, skipping build step... +[default0]:Loading extension module utils... +[default0]:Time to load utils op: 0.0010459423065185547 seconds +[default3]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default3]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default3]:No modifications detected for re-loaded extension module utils, skipping build step... +[default3]:Loading extension module utils... +[default3]:Time to load utils op: 0.0023179054260253906 seconds +[default3]:No modifications detected for re-loaded extension module utils, skipping build step... +[default3]:Loading extension module utils... +[default3]:Time to load utils op: 0.002562999725341797 seconds +[default1]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default1]:No modifications detected for re-loaded extension module utils, skipping build step... +[default1]:Loading extension module utils... +[default1]:Time to load utils op: 0.002589702606201172 seconds +[default1]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default2]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default2]:No modifications detected for re-loaded extension module utils, skipping build step... +[default2]:Loading extension module utils... +[default2]:Time to load utils op: 0.0022270679473876953 seconds +[default2]:Rank: 2 partition count [16, 16, 16] and sizes[(24969216, False), (9961472, False), (14848, False)] +[default1]:No modifications detected for re-loaded extension module utils, skipping build step... +[default1]:Loading extension module utils... +[default2]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default2]:No modifications detected for re-loaded extension module utils, skipping build step... +[default2]:Loading extension module utils... +[default0]:Rank: 24 partition count [16, 16, 16] and sizes[(24969216, False), (9961472, False), (14848, False)] +[default0]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default2]:Time to load utils op: 0.0025320053100585938 seconds +[default0]:No modifications detected for re-loaded extension module utils, skipping build step... +[default0]:Loading extension module utils... +[default0]:Time to load utils op: 0.002523183822631836 seconds +[default0]:[2022-10-07 12:25:31,973] [INFO] [utils.py:827:see_memory_usage] Before initializing optimizer states +[default0]:[2022-10-07 12:25:31,974] [INFO] [utils.py:828:see_memory_usage] MA 1.17 GB Max_MA 1.19 GB CA 1.79 GB Max_CA 2 GB +[default0]:[2022-10-07 12:25:31,974] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 34.69 GB, percent = 18.5% +[default1]:Time to load utils op: 0.00241851806640625 seconds +[default1]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default1]:No modifications detected for re-loaded extension module utils, skipping build step... +[default1]:Loading extension module utils... +[default3]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default1]:Time to load utils op: 0.001184701919555664 seconds +[default3]:No modifications detected for re-loaded extension module utils, skipping build step... +[default3]:Loading extension module utils... +[default3]:Time to load utils op: 0.0011646747589111328 seconds +[default2]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default2]:No modifications detected for re-loaded extension module utils, skipping build step... +[default2]:Loading extension module utils... +[default2]:Time to load utils op: 0.001051187515258789 seconds +[default0]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default0]:No modifications detected for re-loaded extension module utils, skipping build step... +[default0]:Loading extension module utils... +[default0]:Time to load utils op: 0.0012493133544921875 seconds +[default0]:[2022-10-07 12:25:32,014] [INFO] [utils.py:827:see_memory_usage] After initializing optimizer states +[default0]:[2022-10-07 12:25:32,014] [INFO] [utils.py:828:see_memory_usage] MA 1.43 GB Max_MA 1.56 GB CA 2.14 GB Max_CA 2 GB +[default0]:[2022-10-07 12:25:32,015] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 34.78 GB, percent = 18.6% +[default0]:[2022-10-07 12:25:32,015] [INFO] [stage_1_and_2.py:516:__init__] optimizer state initialized +[default0]:[2022-10-07 12:25:32,048] [INFO] [utils.py:827:see_memory_usage] After initializing ZeRO optimizer +[default0]:[2022-10-07 12:25:32,049] [INFO] [utils.py:828:see_memory_usage] MA 1.43 GB Max_MA 1.43 GB CA 2.14 GB Max_CA 2 GB +[default0]:[2022-10-07 12:25:32,049] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 34.79 GB, percent = 18.6% +[default0]:[2022-10-07 12:25:32,049] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed Final Optimizer = FusedAdam +[default0]:[2022-10-07 12:25:32,049] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed using client LR scheduler +[default0]:[2022-10-07 12:25:32,049] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed LR Scheduler = +[default0]:[2022-10-07 12:25:32,050] [INFO] [logging.py:68:log_dist] [Rank 0] step=0, skipped=0, lr=[2e-05, 2e-05, 2e-05], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)] +[default0]:[2022-10-07 12:25:32,050] [INFO] [config.py:987:print] DeepSpeedEngine configuration: +[default0]:[2022-10-07 12:25:32,050] [INFO] [config.py:991:print] activation_checkpointing_config { +[default0]: "partition_activations": false, +[default0]: "contiguous_memory_optimization": false, +[default0]: "cpu_checkpointing": false, +[default0]: "number_checkpoints": null, +[default0]: "synchronize_checkpoint_boundary": false, +[default0]: "profile": false +[default0]:} +[default0]:[2022-10-07 12:25:32,050] [INFO] [config.py:991:print] aio_config ................... {'block_size': 1048576, 'queue_depth': 8, 'thread_count': 1, 'single_submit': False, 'overlap_events': True} +[default0]:[2022-10-07 12:25:32,050] [INFO] [config.py:991:print] amp_enabled .................. False +[default0]:[2022-10-07 12:25:32,050] [INFO] [config.py:991:print] amp_params ................... False +[default0]:[2022-10-07 12:25:32,051] [INFO] [config.py:991:print] autotuning_config ............ { +[default0]: "enabled": false, +[default0]: "start_step": null, +[default0]: "end_step": null, +[default0]: "metric_path": null, +[default0]: "arg_mappings": null, +[default0]: "metric": "throughput", +[default0]: "model_info": null, +[default0]: "results_dir": null, +[default0]: "exps_dir": null, +[default0]: "overwrite": true, +[default0]: "fast": true, +[default0]: "start_profile_step": 3, +[default0]: "end_profile_step": 5, +[default0]: "tuner_type": "gridsearch", +[default0]: "tuner_early_stopping": 5, +[default0]: "tuner_num_trials": 50, +[default0]: "model_info_path": null, +[default0]: "mp_size": 1, +[default0]: "max_train_batch_size": null, +[default0]: "min_train_batch_size": 1, +[default0]: "max_train_micro_batch_size_per_gpu": 1.024000e+03, +[default0]: "min_train_micro_batch_size_per_gpu": 1, +[default0]: "num_tuning_micro_batch_sizes": 3 +[default0]:} +[default0]:[2022-10-07 12:25:32,051] [INFO] [config.py:991:print] bfloat16_enabled ............. False +[default0]:[2022-10-07 12:25:32,051] [INFO] [config.py:991:print] checkpoint_tag_validation_enabled True +[default0]:[2022-10-07 12:25:32,051] [INFO] [config.py:991:print] checkpoint_tag_validation_fail False +[default0]:[2022-10-07 12:25:32,051] [INFO] [config.py:991:print] comms_config ................. +[default0]:[2022-10-07 12:25:32,051] [INFO] [config.py:991:print] communication_data_type ...... None +[default0]:[2022-10-07 12:25:32,051] [INFO] [config.py:991:print] compression_config ........... {'weight_quantization': {'shared_parameters': {'enabled': False, 'quantizer_kernel': False, 'schedule_offset': 0, 'quantize_groups': 1, 'quantize_verbose': False, 'quantization_type': 'symmetric', 'quantize_weight_in_forward': False, 'rounding': 'nearest', 'fp16_mixed_quantize': False, 'quantize_change_ratio': 0.001}, 'different_groups': {}}, 'activation_quantization': {'shared_parameters': {'enabled': False, 'quantization_type': 'symmetric', 'range_calibration': 'dynamic', 'schedule_offset': 1000}, 'different_groups': {}}, 'sparse_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'row_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'head_pruning': {'shared_parameters': {'enabled': False, 'method': 'topk', 'schedule_offset': 1000}, 'different_groups': {}}, 'channel_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'layer_reduction': {'enabled': False}} +[default0]:[2022-10-07 12:25:32,051] [INFO] [config.py:991:print] curriculum_enabled ........... False +[default0]:[2022-10-07 12:25:32,051] [INFO] [config.py:991:print] curriculum_params ............ False +[default0]:[2022-10-07 12:25:32,051] [INFO] [config.py:991:print] dataloader_drop_last ......... False +[default0]:[2022-10-07 12:25:32,051] [INFO] [config.py:991:print] disable_allgather ............ False +[default0]:[2022-10-07 12:25:32,051] [INFO] [config.py:991:print] dump_state ................... False +[default0]:[2022-10-07 12:25:32,051] [INFO] [config.py:991:print] dynamic_loss_scale_args ...... {'init_scale': 4096, 'scale_window': 500, 'delayed_shift': 2, 'min_scale': 1} +[default0]:[2022-10-07 12:25:32,051] [INFO] [config.py:991:print] eigenvalue_enabled ........... False +[default0]:[2022-10-07 12:25:32,051] [INFO] [config.py:991:print] eigenvalue_gas_boundary_resolution 1 +[default0]:[2022-10-07 12:25:32,051] [INFO] [config.py:991:print] eigenvalue_layer_name ........ bert.encoder.layer +[default0]:[2022-10-07 12:25:32,051] [INFO] [config.py:991:print] eigenvalue_layer_num ......... 0 +[default0]:[2022-10-07 12:25:32,051] [INFO] [config.py:991:print] eigenvalue_max_iter .......... 100 +[default0]:[2022-10-07 12:25:32,051] [INFO] [config.py:991:print] eigenvalue_stability ......... 1e-06 +[default0]:[2022-10-07 12:25:32,051] [INFO] [config.py:991:print] eigenvalue_tol ............... 0.01 +[default0]:[2022-10-07 12:25:32,051] [INFO] [config.py:991:print] eigenvalue_verbose ........... False +[default0]:[2022-10-07 12:25:32,051] [INFO] [config.py:991:print] elasticity_enabled ........... False +[default0]:[2022-10-07 12:25:32,051] [INFO] [config.py:991:print] flops_profiler_config ........ { +[default0]: "enabled": false, +[default0]: "profile_step": 1, +[default0]: "module_depth": -1, +[default0]: "top_modules": 1, +[default0]: "detailed": true, +[default0]: "output_file": null +[default0]:} +[default0]:[2022-10-07 12:25:32,051] [INFO] [config.py:991:print] fp16_auto_cast ............... False +[default0]:[2022-10-07 12:25:32,051] [INFO] [config.py:991:print] fp16_enabled ................. True +[default0]:[2022-10-07 12:25:32,051] [INFO] [config.py:991:print] fp16_master_weights_and_gradients False +[default0]:[2022-10-07 12:25:32,051] [INFO] [config.py:991:print] global_rank .................. 0 +[default0]:[2022-10-07 12:25:32,051] [INFO] [config.py:991:print] gradient_accumulation_steps .. 128 +[default0]:[2022-10-07 12:25:32,051] [INFO] [config.py:991:print] gradient_clipping ............ 1.0 +[default0]:[2022-10-07 12:25:32,051] [INFO] [config.py:991:print] gradient_predivide_factor .... 1.0 +[default0]:[2022-10-07 12:25:32,051] [INFO] [config.py:991:print] initial_dynamic_scale ........ 4096 +[default0]:[2022-10-07 12:25:32,051] [INFO] [config.py:991:print] load_universal_checkpoint .... False +[default0]:[2022-10-07 12:25:32,051] [INFO] [config.py:991:print] loss_scale ................... 0 +[default0]:[2022-10-07 12:25:32,051] [INFO] [config.py:991:print] memory_breakdown ............. False +[default0]:[2022-10-07 12:25:32,051] [INFO] [config.py:991:print] monitor_config ............... +[default0]:[2022-10-07 12:25:32,052] [INFO] [config.py:991:print] nebula_config ................ { +[default0]: "enabled": false, +[default0]: "persistent_storage_path": null, +[default0]: "persistent_time_interval": 100, +[default0]: "num_of_version_in_retention": 2, +[default0]: "enable_nebula_load": true, +[default0]: "load_path": null +[default0]:} +[default0]:[2022-10-07 12:25:32,052] [INFO] [config.py:991:print] optimizer_legacy_fusion ...... False +[default0]:[2022-10-07 12:25:32,052] [INFO] [config.py:991:print] optimizer_name ............... None +[default0]:[2022-10-07 12:25:32,052] [INFO] [config.py:991:print] optimizer_params ............. None +[default0]:[2022-10-07 12:25:32,052] [INFO] [config.py:991:print] pipeline ..................... {'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0} +[default0]:[2022-10-07 12:25:32,052] [INFO] [config.py:991:print] pld_enabled .................. False +[default0]:[2022-10-07 12:25:32,052] [INFO] [config.py:991:print] pld_params ................... False +[default0]:[2022-10-07 12:25:32,052] [INFO] [config.py:991:print] prescale_gradients ........... False +[default0]:[2022-10-07 12:25:32,052] [INFO] [config.py:991:print] scheduler_name ............... None +[default0]:[2022-10-07 12:25:32,052] [INFO] [config.py:991:print] scheduler_params ............. None +[default0]:[2022-10-07 12:25:32,052] [INFO] [config.py:991:print] sparse_attention ............. None +[default0]:[2022-10-07 12:25:32,052] [INFO] [config.py:991:print] sparse_gradients_enabled ..... False +[default0]:[2022-10-07 12:25:32,052] [INFO] [config.py:991:print] steps_per_print .............. 2000 +[default0]:[2022-10-07 12:25:32,052] [INFO] [config.py:991:print] train_batch_size ............. 2048 +[default0]:[2022-10-07 12:25:32,052] [INFO] [config.py:991:print] train_micro_batch_size_per_gpu 1 +[default0]:[2022-10-07 12:25:32,052] [INFO] [config.py:991:print] wall_clock_breakdown ......... False +[default0]:[2022-10-07 12:25:32,052] [INFO] [config.py:991:print] world_size ................... 16 +[default0]:[2022-10-07 12:25:32,052] [INFO] [config.py:991:print] zero_allow_untested_optimizer False +[default0]:[2022-10-07 12:25:32,052] [INFO] [config.py:991:print] zero_config .................. stage=1 contiguous_gradients=True reduce_scatter=True reduce_bucket_size=500000000 allgather_partitions=True allgather_bucket_size=500000000 overlap_comm=False load_from_fp32_weights=True elastic_checkpoint=False offload_param=None offload_optimizer=None sub_group_size=1000000000 cpu_offload_param=None cpu_offload_use_pin_memory=None cpu_offload=None prefetch_bucket_size=50000000 param_persistence_threshold=100000 model_persistence_threshold=9223372036854775807 max_live_parameters=1000000000 max_reuse_distance=1000000000 gather_16bit_weights_on_model_save=False stage3_gather_fp16_weights_on_model_save=False ignore_unused_parameters=True legacy_stage1=False round_robin_gradients=False +[default0]:[2022-10-07 12:25:32,052] [INFO] [config.py:991:print] zero_enabled ................. True +[default0]:[2022-10-07 12:25:32,052] [INFO] [config.py:991:print] zero_optimization_stage ...... 1 +[default0]:[2022-10-07 12:25:32,052] [INFO] [config.py:976:print_user_config] json = { +[default0]: "train_micro_batch_size_per_gpu": 1, +[default0]: "train_batch_size": 2.048000e+03, +[default0]: "gradient_clipping": 1.0, +[default0]: "zero_optimization": { +[default0]: "stage": 1 +[default0]: }, +[default0]: "fp16": { +[default0]: "enabled": true, +[default0]: "loss_scale": 0, +[default0]: "loss_scale_window": 500, +[default0]: "hysteresis": 2, +[default0]: "min_loss_scale": 1, +[default0]: "initial_scale_power": 12 +[default0]: }, +[default0]: "steps_per_print": 2.000000e+03, +[default0]: "wall_clock_breakdown": false +[default0]:} +[default0]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default0]:No modifications detected for re-loaded extension module utils, skipping build step... +[default0]:Loading extension module utils... +[default0]:Time to load utils op: 0.00048613548278808594 seconds +[default0]:[2022-10-07 12:25:32,053] [INFO] [engine.py:87:__init__] CONFIG: micro_batches=128 micro_batch_size=1 +[default2]:[2022-10-07 12:25:32,159] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default0]:[2022-10-07 12:25:32,158] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default3]:[2022-10-07 12:25:32,159] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default1]:[2022-10-07 12:25:32,158] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default2]:[2022-10-07 12:25:32,159] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default0]:[2022-10-07 12:25:32,117] [INFO] [engine.py:145:__init__] RANK=32 STAGE=1 LAYERS=16 [15, 31) STAGE_PARAMS=559128576 (559.129M) TOTAL_PARAMS=2236514304 (2236.514M) UNIQUE_PARAMS=1722712064 (1722.712M) +[default0]:[2022-10-07 12:25:32,159] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default1]:[2022-10-07 12:25:32,117] [INFO] [engine.py:145:__init__] RANK=33 STAGE=1 LAYERS=16 [15, 31) STAGE_PARAMS=559128576 (559.129M) TOTAL_PARAMS=2236514304 (2236.514M) UNIQUE_PARAMS=1722712064 (1722.712M) +[default1]:[2022-10-07 12:25:32,159] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default3]:[2022-10-07 12:25:32,159] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default1]:[2022-10-07 12:25:32,159] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default3]:[2022-10-07 12:25:32,159] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default2]:[2022-10-07 12:25:32,159] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default0]:[2022-10-07 12:25:32,159] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default2]:[2022-10-07 12:25:32,158] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default0]:[2022-10-07 12:25:32,158] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default2]:[2022-10-07 12:25:32,158] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default3]:[2022-10-07 12:25:32,158] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default1]:[2022-10-07 12:25:32,158] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default0]:[2022-10-07 12:25:32,158] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default3]:[2022-10-07 12:25:32,159] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default3]:[2022-10-07 12:25:32,158] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default1]:[2022-10-07 12:25:32,159] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default2]:[2022-10-07 12:25:32,159] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default0]:[2022-10-07 12:25:32,159] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default1]:[2022-10-07 12:25:32,160] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default1]:[2022-10-07 12:25:32,158] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default0]:[2022-10-07 12:25:32,160] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default2]:[2022-10-07 12:25:32,160] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default3]:[2022-10-07 12:25:32,160] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default1]:[2022-10-07 12:25:32,159] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default2]:[2022-10-07 12:25:32,159] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default3]:[2022-10-07 12:25:32,159] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default1]:[2022-10-07 12:25:32,159] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default0]:[2022-10-07 12:25:32,159] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default3]:[2022-10-07 12:25:32,159] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default0]:[2022-10-07 12:25:32,160] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default0]:[2022-10-07 12:25:32,159] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default2]:[2022-10-07 12:25:32,160] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default3]:[2022-10-07 12:25:32,161] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default1]:[2022-10-07 12:25:32,160] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default2]:[2022-10-07 12:25:32,161] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default1]:[2022-10-07 12:25:32,159] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default3]:[2022-10-07 12:25:32,159] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default2]:[2022-10-07 12:25:32,159] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default3]:[2022-10-07 12:25:32,159] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default0]:[2022-10-07 12:25:32,159] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default2]:[2022-10-07 12:25:32,159] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default1]:[2022-10-07 12:25:32,159] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default0]:[2022-10-07 12:25:32,159] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default1]:[2022-10-07 12:25:32,158] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default3]:[2022-10-07 12:25:32,159] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default2]:[2022-10-07 12:25:32,159] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default0]:[2022-10-07 12:25:32,158] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default1]:[2022-10-07 12:25:32,159] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default3]:[2022-10-07 12:25:32,159] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default3]:[2022-10-07 12:25:32,159] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default2]:[2022-10-07 12:25:32,159] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default2]:[2022-10-07 12:25:32,159] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default0]:[2022-10-07 12:25:32,159] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default1]:[2022-10-07 12:25:32,117] [INFO] [engine.py:145:__init__] RANK=1 STAGE=0 LAYERS=15 [0, 15) STAGE_PARAMS=559128576 (559.129M) TOTAL_PARAMS=2236514304 (2236.514M) UNIQUE_PARAMS=1722712064 (1722.712M) +[default1]:[2022-10-07 12:25:32,159] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default0]:[2022-10-07 12:25:32,116] [INFO] [engine.py:145:__init__] RANK=0 STAGE=0 LAYERS=15 [0, 15) STAGE_PARAMS=559128576 (559.129M) TOTAL_PARAMS=2236514304 (2236.514M) UNIQUE_PARAMS=1722712064 (1722.712M) +[default0]:[2022-10-07 12:25:32,159] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default3]:[2022-10-07 12:25:32,158] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default1]:[2022-10-07 12:25:32,158] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default2]:[2022-10-07 12:25:32,158] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default0]:[2022-10-07 12:25:32,158] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default3]:[2022-10-07 12:25:34,460] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default2]:[2022-10-07 12:25:34,460] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default2]:[2022-10-07 12:25:34,480] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default2]:[2022-10-07 12:25:34,479] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default3]:[2022-10-07 12:25:34,479] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default3]:[2022-10-07 12:25:34,480] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default2]:[2022-10-07 12:25:34,476] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default3]:[2022-10-07 12:25:34,476] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default1]:[2022-10-07 12:25:34,457] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default1]:[2022-10-07 12:25:34,474] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_01_model_states.pt... +[default1]:[2022-10-07 12:25:34,459] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default1]:[2022-10-07 12:25:34,480] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_03_model_states.pt... +[default0]:[2022-10-07 12:25:34,459] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default0]:[2022-10-07 12:25:34,480] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_02_model_states.pt... +[default1]:[2022-10-07 12:25:34,558] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default1]:[2022-10-07 12:25:34,574] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_03_model_states.pt... +[default3]:[2022-10-07 12:25:34,584] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default3]:[2022-10-07 12:25:34,490] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_01_model_states.pt... +[default2]:[2022-10-07 12:25:34,490] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default2]:[2022-10-07 12:25:34,502] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default2]:[2022-10-07 12:25:34,501] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_02_model_states.pt... +[default3]:[2022-10-07 12:25:34,501] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_03_model_states.pt... +[default3]:[2022-10-07 12:25:34,503] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_01_model_states.pt... +[default2]:[2022-10-07 12:25:34,497] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default3]:[2022-10-07 12:25:34,498] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_01_model_states.pt... +[default0]:[2022-10-07 12:25:34,510] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default0]:[2022-10-07 12:25:34,527] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_02_model_states.pt... +[default1]:[2022-10-07 12:25:34,559] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default1]:[2022-10-07 12:25:34,575] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_03_model_states.pt... +[default1]:[2022-10-07 12:25:34,532] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default1]:[2022-10-07 12:25:34,549] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_01_model_states.pt... +[default3]:[2022-10-07 12:25:34,600] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_03_model_states.pt... +[default3]:[2022-10-07 12:25:34,602] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default3]:[2022-10-07 12:25:34,660] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_03_model_states.pt... +[default2]:[2022-10-07 12:25:34,602] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default2]:[2022-10-07 12:25:34,659] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_02_model_states.pt... +[default2]:[2022-10-07 12:25:34,612] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default2]:[2022-10-07 12:25:34,629] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default3]:[2022-10-07 12:25:34,627] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default3]:[2022-10-07 12:25:34,643] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_01_model_states.pt... +[default0]:[2022-10-07 12:25:34,608] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default0]:[2022-10-07 12:25:34,624] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_02_model_states.pt... +[default2]:[2022-10-07 12:25:34,619] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default2]:[2022-10-07 12:25:34,635] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default2]:[2022-10-07 12:25:34,762] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default2]:[2022-10-07 12:25:34,781] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_02_model_states.pt... +[default0]:[2022-10-07 12:25:34,763] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default0]:[2022-10-07 12:25:34,781] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_02_model_states.pt... +[default0]:[2022-10-07 12:25:34,724] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default0]:[2022-10-07 12:25:34,746] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default0]:[2022-10-07 12:25:34,750] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default0]:[2022-10-07 12:25:34,772] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_02_model_states.pt... +[default1]:[2022-10-07 12:25:34,750] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default1]:[2022-10-07 12:25:34,772] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_03_model_states.pt... +[default1]:[2022-10-07 12:25:34,724] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default1]:[2022-10-07 12:25:34,746] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_01_model_states.pt... +[default1]:[2022-10-07 12:25:34,741] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default1]:[2022-10-07 12:25:34,759] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_03_model_states.pt... +[default3]:[2022-10-07 12:25:34,754] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default3]:[2022-10-07 12:25:34,775] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_03_model_states.pt... +[default2]:[2022-10-07 12:25:34,754] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default2]:[2022-10-07 12:25:34,776] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_02_model_states.pt... +[default3]:[2022-10-07 12:25:34,821] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default3]:[2022-10-07 12:25:34,840] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_03_model_states.pt... +[default2]:[2022-10-07 12:25:34,825] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default2]:[2022-10-07 12:25:34,843] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_02_model_states.pt... +[default1]:[2022-10-07 12:25:34,804] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default1]:[2022-10-07 12:25:34,831] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_01_model_states.pt... +[default0]:[2022-10-07 12:25:34,804] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default0]:[2022-10-07 12:25:34,831] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default0]:[2022-10-07 12:25:34,932] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default0]:[2022-10-07 12:25:34,953] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_02_model_states.pt... +[default1]:[2022-10-07 12:25:34,932] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default1]:[2022-10-07 12:25:34,953] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_03_model_states.pt... +[default1]:[2022-10-07 12:25:35,033] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default1]:[2022-10-07 12:25:35,049] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_03_model_states.pt... +[default0]:[2022-10-07 12:25:35,001] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default0]:[2022-10-07 12:25:35,052] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default1]:[2022-10-07 12:25:35,001] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default1]:[2022-10-07 12:25:35,052] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_01_model_states.pt... +[default1]:[2022-10-07 12:25:35,065] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default1]:[2022-10-07 12:25:35,085] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_03_model_states.pt... +[default0]:[2022-10-07 12:25:35,097] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default0]:[2022-10-07 12:25:35,114] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default2]:[2022-10-07 12:25:35,118] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default2]:[2022-10-07 12:25:35,134] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_02_model_states.pt... +[default1]:[2022-10-07 12:25:35,159] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default0]:[2022-10-07 12:25:35,159] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default3]:[2022-10-07 12:25:35,127] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default3]:[2022-10-07 12:25:35,149] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_03_model_states.pt... +[default2]:[2022-10-07 12:25:35,127] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default2]:[2022-10-07 12:25:35,149] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_02_model_states.pt... +[default0]:[2022-10-07 12:25:35,248] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default0]:[2022-10-07 12:25:35,264] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default1]:[2022-10-07 12:25:35,220] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_01_model_states.pt... +[default0]:[2022-10-07 12:25:35,218] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default0]:[2022-10-07 12:25:35,334] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default0]:[2022-10-07 12:25:35,351] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_02_model_states.pt... +[default2]:[2022-10-07 12:25:35,310] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default2]:[2022-10-07 12:25:35,327] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default2]:[2022-10-07 12:25:35,363] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default2]:[2022-10-07 12:25:35,384] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default3]:[2022-10-07 12:25:35,363] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default3]:[2022-10-07 12:25:35,384] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_01_model_states.pt... +[default3]:[2022-10-07 12:25:35,349] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default3]:[2022-10-07 12:25:35,369] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_01_model_states.pt... +[default0]:[2022-10-07 12:25:35,347] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default0]:[2022-10-07 12:25:35,366] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default0]:[2022-10-07 12:25:35,430] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default3]:[2022-10-07 12:25:35,402] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default3]:[2022-10-07 12:25:35,419] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_03_model_states.pt... +[default2]:[2022-10-07 12:25:35,528] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default2]:[2022-10-07 12:25:35,545] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_02_model_states.pt... +[default1]:[2022-10-07 12:25:35,632] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default0]:[2022-10-07 12:25:35,589] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default3]:[2022-10-07 12:25:35,781] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default1]:[2022-10-07 12:25:35,791] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_01_model_states.pt... +[default0]:[2022-10-07 12:25:35,856] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default0]:[2022-10-07 12:25:35,872] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_02_model_states.pt... +[default3]:[2022-10-07 12:25:35,927] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_01_model_states.pt... +[default3]:[2022-10-07 12:25:36,059] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default3]:[2022-10-07 12:25:36,075] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_01_model_states.pt... +[default1]:[2022-10-07 12:25:36,154] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default1]:[2022-10-07 12:25:36,170] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_01_model_states.pt... +[default2]:[2022-10-07 12:25:36,692] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default3]:[2022-10-07 12:25:36,707] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default3]:[2022-10-07 12:25:36,723] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_03_model_states.pt... +[default2]:[2022-10-07 12:25:36,708] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default3]:[2022-10-07 12:25:37,686] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_01_model_states.pt. +[default3]:[2022-10-07 12:25:37,686] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:37,750] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default0]:[2022-10-07 12:25:37,750] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:37,755] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default2]:[2022-10-07 12:25:37,756] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:37,746] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_01_model_states.pt. +[default1]:[2022-10-07 12:25:37,747] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:37,778] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_01_model_states.pt. +[default1]:[2022-10-07 12:25:37,778] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:37,829] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_02_model_states.pt. +[default0]:[2022-10-07 12:25:37,830] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:37,827] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_02_model_states.pt. +[default0]:[2022-10-07 12:25:37,828] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:37,892] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default2]:[2022-10-07 12:25:37,893] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:37,911] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:37,911] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:37,947] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:37,955] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:37,933] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default2]:[2022-10-07 12:25:37,934] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:37,994] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_01_model_states.pt. +[default0]:[2022-10-07 12:25:37,943] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default0]:[2022-10-07 12:25:37,944] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:37,909] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:37,909] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:37,965] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:37,978] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:37,999] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_03_model_states.pt. +[default3]:[2022-10-07 12:25:38,000] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:38,049] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:38,050] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_01-model_states.pt... +[default0]:[2022-10-07 12:25:38,017] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:38,017] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:38,050] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:38,058] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:38,006] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_02_model_states.pt. +[default2]:[2022-10-07 12:25:38,007] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:38,063] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:37,998] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_03_model_states.pt. +[default2]:[2022-10-07 12:25:38,063] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:37,999] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:38,043] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:38,044] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_01-model_states.pt... +[default2]:[2022-10-07 12:25:38,025] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_02_model_states.pt. +[default2]:[2022-10-07 12:25:38,026] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:38,085] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_03_model_states.pt. +[default3]:[2022-10-07 12:25:38,086] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:38,042] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_02_model_states.pt. +[default0]:[2022-10-07 12:25:38,043] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:38,083] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_03_model_states.pt. +[default3]:[2022-10-07 12:25:38,083] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:38,027] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_02_model_states.pt. +[default2]:[2022-10-07 12:25:38,027] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:37,995] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:38,027] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_02_model_states.pt. +[default0]:[2022-10-07 12:25:38,028] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:38,054] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:38,054] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:38,066] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_03_model_states.pt. +[default1]:[2022-10-07 12:25:38,067] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:38,031] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_02_model_states.pt. +[default0]:[2022-10-07 12:25:38,032] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:38,099] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_01-model_states.pt. +[default3]:[2022-10-07 12:25:38,113] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:38,169] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:38,170] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_01-model_states.pt... +[default0]:[2022-10-07 12:25:38,113] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:38,113] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:38,146] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:38,154] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:38,094] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_01-model_states.pt. +[default2]:[2022-10-07 12:25:38,109] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:38,118] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:38,174] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:38,107] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:38,150] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:38,151] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_01-model_states.pt... +[default2]:[2022-10-07 12:25:38,175] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:38,174] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:38,175] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:38,147] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:38,148] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:38,147] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:38,149] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_01-model_states.pt... +[default3]:[2022-10-07 12:25:38,133] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_01_model_states.pt. +[default3]:[2022-10-07 12:25:38,134] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:38,138] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_03_model_states.pt. +[default1]:[2022-10-07 12:25:38,139] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:38,104] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:38,118] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:38,188] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:38,188] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:38,239] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_01-model_states.pt. +[default3]:[2022-10-07 12:25:38,253] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:38,211] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:38,211] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:38,246] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:38,254] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:38,216] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:38,225] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:38,232] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_01-model_states.pt. +[default2]:[2022-10-07 12:25:38,276] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:38,276] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:38,245] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:38,284] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:38,286] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_01-model_states.pt... +[default3]:[2022-10-07 12:25:38,254] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:38,276] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_01-model_states.pt... +[default2]:[2022-10-07 12:25:38,212] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:38,221] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:38,220] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_03_model_states.pt. +[default3]:[2022-10-07 12:25:38,221] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:38,267] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:38,268] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_01-model_states.pt... +[default2]:[2022-10-07 12:25:38,288] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:38,289] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:38,275] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:38,276] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_01-model_states.pt... +[default0]:[2022-10-07 12:25:38,275] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:38,275] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:38,241] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:38,241] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:38,242] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_01-model_states.pt... +[default2]:[2022-10-07 12:25:38,241] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:38,275] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:38,283] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:38,265] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:38,273] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:38,287] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:38,289] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_01-model_states.pt... +[default1]:[2022-10-07 12:25:38,268] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:38,284] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:38,235] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:38,235] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_01-model_states.pt... +[default0]:[2022-10-07 12:25:38,265] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:38,265] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:38,239] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:38,252] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:38,216] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:38,217] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_01-model_states.pt... +[default1]:[2022-10-07 12:25:38,271] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_01-model_states.pt. +[default2]:[2022-10-07 12:25:38,263] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:38,264] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:38,284] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:38,219] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:38,219] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:38,254] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:38,266] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:38,295] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:38,295] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_01-model_states.pt... +[default3]:[2022-10-07 12:25:38,344] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_01-model_states.pt. +[default3]:[2022-10-07 12:25:38,357] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:38,323] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:38,323] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:38,359] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:38,366] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:38,336] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_01-model_states.pt. +[default2]:[2022-10-07 12:25:38,323] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:38,349] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:38,384] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:38,332] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:38,377] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:38,385] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_01-model_states.pt... +[default2]:[2022-10-07 12:25:38,377] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:38,307] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:38,308] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:38,377] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:38,342] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:38,387] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:38,317] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default2]:[2022-10-07 12:25:38,318] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:38,371] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_01-model_states.pt. +[default3]:[2022-10-07 12:25:38,384] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:38,314] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:38,322] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:38,338] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_01-model_states.pt. +[default3]:[2022-10-07 12:25:38,351] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:38,345] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:38,345] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:38,381] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:38,389] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:38,307] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_01-model_states.pt... +[default0]:[2022-10-07 12:25:38,321] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:38,375] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:38,377] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_01-model_states.pt... +[default1]:[2022-10-07 12:25:38,298] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_01-model_states.pt. +[default1]:[2022-10-07 12:25:38,312] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:38,385] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_03_model_states.pt. +[default0]:[2022-10-07 12:25:38,301] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:38,314] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:38,334] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:38,334] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:38,385] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:38,386] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:38,334] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:38,335] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_01-model_states.pt... +[default0]:[2022-10-07 12:25:38,334] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:38,335] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:38,379] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:38,392] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:38,402] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:38,403] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_01-model_states.pt... +[default0]:[2022-10-07 12:25:38,456] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:38,457] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:38,491] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:38,421] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:38,429] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:38,461] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:38,461] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:38,492] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:38,492] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:38,453] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:38,466] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:38,467] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_01-model_states.pt... +[default3]:[2022-10-07 12:25:38,412] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_01-model_states.pt. +[default3]:[2022-10-07 12:25:38,420] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:38,485] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:38,486] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_01-model_states.pt... +[default0]:[2022-10-07 12:25:38,474] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:38,474] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:38,397] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:38,398] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_01-model_states.pt... +[default2]:[2022-10-07 12:25:38,476] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:38,477] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:38,487] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default2]:[2022-10-07 12:25:38,488] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:38,436] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_03_model_states.pt. +[default1]:[2022-10-07 12:25:38,401] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:38,402] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_01-model_states.pt... +[default1]:[2022-10-07 12:25:38,436] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:38,400] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:38,400] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:38,432] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:38,445] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:38,399] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:38,446] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:38,447] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_01-model_states.pt... +[default2]:[2022-10-07 12:25:38,491] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:38,466] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:38,466] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:38,498] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:38,499] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:38,556] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:38,556] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:38,589] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:38,498] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:38,506] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:38,544] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:38,544] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:38,581] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:38,590] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:38,540] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:38,549] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:38,535] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:38,535] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:38,568] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:38,582] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:38,564] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:38,533] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:38,541] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:38,510] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:38,518] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:38,577] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:38,577] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:38,547] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_01_model_states.pt. +[default1]:[2022-10-07 12:25:38,548] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:38,590] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:38,558] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_01-model_states.pt. +[default1]:[2022-10-07 12:25:38,530] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_01-model_states.pt. +[default0]:[2022-10-07 12:25:38,569] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default0]:[2022-10-07 12:25:38,569] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:38,569] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default1]:[2022-10-07 12:25:38,575] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_01_model_states.pt. +[default1]:[2022-10-07 12:25:38,575] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:38,570] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:38,597] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_01-model_states.pt. +[default1]:[2022-10-07 12:25:38,532] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:38,533] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_01-model_states.pt... +[default1]:[2022-10-07 12:25:38,597] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_01-model_states.pt. +[default0]:[2022-10-07 12:25:38,499] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:38,500] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:38,535] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:38,549] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:38,531] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:38,531] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:38,581] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:38,594] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:38,515] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_01-model_states.pt. +[default1]:[2022-10-07 12:25:38,524] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:38,581] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:38,582] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_01-model_states.pt... +[default1]:[2022-10-07 12:25:38,578] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_01_model_states.pt. +[default1]:[2022-10-07 12:25:38,579] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:38,536] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_01-model_states.pt. +[default1]:[2022-10-07 12:25:38,549] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:38,586] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:38,587] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_01-model_states.pt... +[default2]:[2022-10-07 12:25:38,556] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:38,511] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:38,573] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:38,574] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:38,626] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_01-model_states.pt. +[default3]:[2022-10-07 12:25:38,640] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:38,687] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:38,597] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:38,653] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:38,653] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:38,685] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:38,619] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_01-model_states.pt. +[default1]:[2022-10-07 12:25:38,632] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:38,667] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:38,636] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:38,636] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:38,682] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:38,625] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_01-model_states.pt. +[default3]:[2022-10-07 12:25:38,688] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:38,609] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_02_model_states.pt. +[default0]:[2022-10-07 12:25:38,610] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:38,648] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:38,648] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:38,686] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:38,635] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:38,646] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:38,646] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:38,635] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:38,670] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:38,682] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:38,692] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:38,598] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_01-model_states.pt. +[default3]:[2022-10-07 12:25:38,611] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:38,648] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:38,649] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_01-model_states.pt... +[default2]:[2022-10-07 12:25:38,633] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:38,633] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default0]:[2022-10-07 12:25:38,634] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:38,608] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:38,608] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:38,640] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:38,648] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:38,686] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_01-model_states.pt. +[default1]:[2022-10-07 12:25:38,652] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_03_model_states.pt. +[default1]:[2022-10-07 12:25:38,653] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:38,609] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:38,617] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:38,681] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:38,681] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:38,653] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:38,672] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:38,640] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:38,683] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:38,684] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_01-model_states.pt... +[default3]:[2022-10-07 12:25:38,606] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_01-model_states.pt. +[default2]:[2022-10-07 12:25:38,689] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_02_model_states.pt. +[default2]:[2022-10-07 12:25:38,690] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:38,605] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:38,657] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:38,658] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_01-model_states.pt... +[default1]:[2022-10-07 12:25:38,610] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:38,665] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:38,665] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_01-model_states.pt... +[default0]:[2022-10-07 12:25:38,640] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:38,640] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:38,674] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:38,686] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_02_model_states.pt. +[default0]:[2022-10-07 12:25:38,687] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:38,686] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:38,663] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:38,663] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:38,640] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_01-model_states.pt. +[default1]:[2022-10-07 12:25:38,653] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:38,692] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:38,693] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_01-model_states.pt... +[default2]:[2022-10-07 12:25:38,636] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:38,636] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:38,671] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:38,680] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:38,610] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:38,623] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:38,698] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:38,698] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:38,717] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_01-model_states.pt... +[default3]:[2022-10-07 12:25:38,773] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_01-model_states.pt. +[default3]:[2022-10-07 12:25:38,787] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:38,718] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:38,768] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:38,768] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:38,717] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_01-model_states.pt... +[default1]:[2022-10-07 12:25:38,768] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_01-model_states.pt. +[default1]:[2022-10-07 12:25:38,781] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:38,717] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:38,766] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:38,766] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:38,741] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:38,741] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_01-model_states.pt... +[default3]:[2022-10-07 12:25:38,772] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_01-model_states.pt. +[default2]:[2022-10-07 12:25:38,767] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:38,754] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:38,754] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:38,787] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:38,696] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:38,747] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:38,747] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:38,785] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:38,701] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:38,747] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:38,748] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:38,791] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:38,697] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_01-model_states.pt. +[default3]:[2022-10-07 12:25:38,709] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:38,744] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:38,745] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_01-model_states.pt... +[default1]:[2022-10-07 12:25:38,742] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_03_model_states.pt. +[default1]:[2022-10-07 12:25:38,743] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:38,786] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:38,786] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_01-model_states.pt... +[default2]:[2022-10-07 12:25:38,719] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:38,719] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:38,753] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:38,762] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:38,753] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_01-model_states.pt. +[default3]:[2022-10-07 12:25:38,761] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:38,714] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:38,714] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:38,749] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:38,757] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:38,699] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:38,699] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_01-model_states.pt... +[default1]:[2022-10-07 12:25:38,744] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_01-model_states.pt. +[default1]:[2022-10-07 12:25:38,752] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:38,725] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:38,733] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:38,699] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:38,746] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:38,746] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_01-model_states.pt... +[default3]:[2022-10-07 12:25:38,719] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:38,720] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_01-model_states.pt... +[default3]:[2022-10-07 12:25:38,783] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_01-model_states.pt. +[default1]:[2022-10-07 12:25:38,733] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_01-model_states.pt. +[default1]:[2022-10-07 12:25:38,746] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:38,789] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:38,737] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_01-model_states.pt. +[default2]:[2022-10-07 12:25:38,728] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:38,730] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:38,776] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:38,719] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:38,741] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:38,741] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:38,780] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:38,793] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:38,733] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_02_model_states.pt. +[default2]:[2022-10-07 12:25:38,734] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:38,773] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:38,773] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:38,771] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_01-model_states.pt. +[default1]:[2022-10-07 12:25:38,783] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:38,768] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_01-model_states.pt. +[default1]:[2022-10-07 12:25:38,777] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:38,732] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:38,733] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:38,768] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:38,781] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:38,743] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:38,744] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:38,787] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:38,769] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_01-model_states.pt. +[default1]:[2022-10-07 12:25:38,778] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:38,715] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:38,729] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:38,758] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_03_model_states.pt. +[default3]:[2022-10-07 12:25:38,758] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:38,753] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_01-model_states.pt. +[default1]:[2022-10-07 12:25:38,766] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:38,800] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:38,801] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_01-model_states.pt... +[default2]:[2022-10-07 12:25:38,703] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_02_model_states.pt. +[default2]:[2022-10-07 12:25:38,703] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:38,749] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:38,749] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:38,784] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:38,792] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:38,733] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:38,745] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:38,790] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_01-model_states.pt... +[default3]:[2022-10-07 12:25:38,835] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:38,835] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_01-model_states.pt... +[default3]:[2022-10-07 12:25:38,882] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_01-model_states.pt. +[default0]:[2022-10-07 12:25:38,801] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:38,810] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:38,871] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:38,871] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:38,817] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:38,818] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_01-model_states.pt... +[default1]:[2022-10-07 12:25:38,868] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_01-model_states.pt. +[default1]:[2022-10-07 12:25:38,881] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:38,811] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:38,820] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:38,870] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:38,871] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:38,817] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:38,864] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:38,864] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_01-model_states.pt... +[default0]:[2022-10-07 12:25:38,794] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:38,833] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:38,833] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:38,869] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:38,878] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:38,800] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:38,856] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:38,856] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:38,798] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_01-model_states.pt. +[default3]:[2022-10-07 12:25:38,810] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:38,846] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:38,847] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_01-model_states.pt... +[default3]:[2022-10-07 12:25:38,891] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_01-model_states.pt. +[default1]:[2022-10-07 12:25:38,821] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_01-model_states.pt. +[default1]:[2022-10-07 12:25:38,829] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:38,874] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:38,874] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_01-model_states.pt... +[default2]:[2022-10-07 12:25:38,832] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:38,833] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:38,867] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:38,875] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:38,817] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:38,818] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_01-model_states.pt... +[default3]:[2022-10-07 12:25:38,866] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_01-model_states.pt. +[default3]:[2022-10-07 12:25:38,874] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:38,876] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_02_model_states.pt. +[default2]:[2022-10-07 12:25:38,877] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:38,820] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:38,820] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:38,854] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:38,862] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:38,798] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:38,798] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_01-model_states.pt... +[default1]:[2022-10-07 12:25:38,843] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_01-model_states.pt. +[default1]:[2022-10-07 12:25:38,877] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:38,879] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_01-model_states.pt... +[default2]:[2022-10-07 12:25:38,812] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:38,812] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:38,846] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:38,877] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:38,851] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:38,811] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_01-model_states.pt. +[default3]:[2022-10-07 12:25:38,824] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:38,857] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:38,858] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_01-model_states.pt... +[default2]:[2022-10-07 12:25:38,799] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:38,800] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:38,833] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:38,842] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:38,797] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:38,843] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:38,844] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_01-model_states.pt... +[default3]:[2022-10-07 12:25:38,891] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_01-model_states.pt. +[default1]:[2022-10-07 12:25:38,848] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:38,883] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_01_model_states.pt. +[default3]:[2022-10-07 12:25:38,884] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:38,887] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:38,817] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:38,818] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_01-model_states.pt... +[default3]:[2022-10-07 12:25:38,869] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_01-model_states.pt. +[default3]:[2022-10-07 12:25:38,883] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:38,842] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:38,843] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:38,883] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:38,896] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:38,806] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_03_model_states.pt. +[default3]:[2022-10-07 12:25:38,807] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:38,846] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:38,846] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_01-model_states.pt... +[default3]:[2022-10-07 12:25:38,883] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_01-model_states.pt. +[default3]:[2022-10-07 12:25:38,891] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:38,822] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:38,823] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_01-model_states.pt... +[default1]:[2022-10-07 12:25:38,878] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_01-model_states.pt. +[default2]:[2022-10-07 12:25:38,806] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:38,815] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_03_model_states.pt. +[default2]:[2022-10-07 12:25:38,813] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:38,887] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:38,834] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:38,834] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:38,868] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:38,863] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:38,863] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:38,816] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:38,881] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:38,856] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:38,856] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_01-model_states.pt... +[default2]:[2022-10-07 12:25:38,895] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:38,891] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_01-model_states.pt. +[default1]:[2022-10-07 12:25:38,837] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:38,838] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_01-model_states.pt... +[default1]:[2022-10-07 12:25:38,888] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_01-model_states.pt. +[default2]:[2022-10-07 12:25:38,800] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:38,844] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:38,844] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:38,884] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:38,897] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:38,832] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:38,833] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_01-model_states.pt... +[default0]:[2022-10-07 12:25:38,807] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:38,807] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:38,855] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:38,869] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:38,885] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:38,886] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_01-model_states.pt... +[default3]:[2022-10-07 12:25:38,811] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:38,811] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_01-model_states.pt... +[default3]:[2022-10-07 12:25:38,851] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_01-model_states.pt. +[default3]:[2022-10-07 12:25:38,859] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:38,846] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_01-model_states.pt. +[default1]:[2022-10-07 12:25:38,859] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:38,890] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:38,891] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_01-model_states.pt... +[default2]:[2022-10-07 12:25:38,808] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:38,809] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:38,853] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:38,861] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:38,862] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:38,862] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:38,894] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:38,802] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:38,802] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:38,836] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:38,849] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:38,810] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:38,800] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:38,851] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:38,851] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:38,887] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:38,842] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_01-model_states.pt. +[default1]:[2022-10-07 12:25:38,855] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:38,892] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:38,893] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_01-model_states.pt... +[default3]:[2022-10-07 12:25:38,895] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:38,905] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:38,913] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:38,978] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:38,978] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:38,986] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:38,986] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_01-model_states.pt... +[default2]:[2022-10-07 12:25:38,916] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:38,925] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:38,975] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:38,976] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:38,896] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_01-model_states.pt. +[default3]:[2022-10-07 12:25:38,904] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:38,949] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:38,949] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_01-model_states.pt... +[default3]:[2022-10-07 12:25:38,981] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_01-model_states.pt. +[default3]:[2022-10-07 12:25:38,989] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:38,899] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:38,918] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:38,918] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:38,960] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:38,968] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:38,900] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:38,909] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:38,904] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:38,940] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:38,940] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_01-model_states.pt... +[default1]:[2022-10-07 12:25:38,909] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_01-model_states.pt. +[default1]:[2022-10-07 12:25:38,918] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:38,955] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:38,955] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_01-model_states.pt... +[default1]:[2022-10-07 12:25:38,992] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_01-model_states.pt. +[default2]:[2022-10-07 12:25:38,943] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:38,944] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:38,978] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:38,986] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:38,951] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:38,969] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:38,914] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:38,915] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_01-model_states.pt... +[default3]:[2022-10-07 12:25:38,975] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_01-model_states.pt. +[default3]:[2022-10-07 12:25:38,983] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:38,933] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:38,933] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:38,968] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:38,981] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:38,929] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:38,929] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:38,964] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:38,973] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:38,898] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:38,898] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_01-model_states.pt... +[default1]:[2022-10-07 12:25:38,943] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_01-model_states.pt. +[default2]:[2022-10-07 12:25:38,967] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:38,967] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:38,951] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:38,989] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:38,989] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_01-model_states.pt... +[default3]:[2022-10-07 12:25:38,918] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_01-model_states.pt. +[default3]:[2022-10-07 12:25:38,931] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:38,985] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:38,985] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_01-model_states.pt... +[default2]:[2022-10-07 12:25:38,929] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:38,930] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:38,963] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:38,971] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:38,903] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:38,943] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:38,944] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_01-model_states.pt... +[default3]:[2022-10-07 12:25:38,991] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_01-model_states.pt. +[default1]:[2022-10-07 12:25:38,953] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_01-model_states.pt. +[default1]:[2022-10-07 12:25:38,965] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:38,929] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:38,930] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_01-model_states.pt... +[default3]:[2022-10-07 12:25:38,966] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:38,967] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_01-model_states.pt... +[default2]:[2022-10-07 12:25:38,926] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:38,946] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:38,946] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:38,983] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:38,925] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:38,927] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:38,925] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:38,927] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_01-model_states.pt... +[default2]:[2022-10-07 12:25:38,941] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:38,942] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:38,929] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:38,929] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:38,964] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:38,977] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:38,934] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:38,934] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_01-model_states.pt... +[default3]:[2022-10-07 12:25:38,967] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_01-model_states.pt. +[default3]:[2022-10-07 12:25:38,976] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:38,929] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:38,930] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_01-model_states.pt... +[default3]:[2022-10-07 12:25:38,900] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:38,901] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:38,995] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:38,995] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_01-model_states.pt... +[default2]:[2022-10-07 12:25:38,903] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:38,943] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:38,944] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:38,981] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:38,990] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:38,942] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:38,943] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_01-model_states.pt... +[default3]:[2022-10-07 12:25:38,980] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_01-model_states.pt. +[default3]:[2022-10-07 12:25:38,990] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:38,940] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:38,940] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:38,977] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:38,991] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:38,914] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_01-model_states.pt. +[default1]:[2022-10-07 12:25:38,923] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:38,982] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:38,982] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_01-model_states.pt... +[default0]:[2022-10-07 12:25:38,959] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:38,959] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:38,912] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:38,912] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_01-model_states.pt... +[default3]:[2022-10-07 12:25:38,947] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_01-model_states.pt. +[default3]:[2022-10-07 12:25:38,955] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:38,999] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:38,999] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_01-model_states.pt... +[default1]:[2022-10-07 12:25:38,939] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_01-model_states.pt. +[default1]:[2022-10-07 12:25:38,952] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:38,984] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:38,985] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_01-model_states.pt... +[default2]:[2022-10-07 12:25:38,944] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:38,944] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:38,980] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:38,988] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:38,903] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:38,949] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:38,949] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:38,983] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:38,992] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:38,942] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:38,942] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:38,974] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:38,987] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:38,993] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:38,993] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_01-model_states.pt... +[default3]:[2022-10-07 12:25:39,043] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_01-model_states.pt. +[default3]:[2022-10-07 12:25:39,056] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:39,013] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:39,021] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:39,071] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:39,072] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:39,037] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_01-model_states.pt. +[default1]:[2022-10-07 12:25:39,050] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:39,085] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:39,086] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_01-model_states.pt... +[default2]:[2022-10-07 12:25:39,022] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,031] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,084] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,085] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:39,022] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:39,022] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_01-model_states.pt... +[default3]:[2022-10-07 12:25:39,064] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_01-model_states.pt. +[default3]:[2022-10-07 12:25:39,072] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:39,007] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:39,007] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:39,039] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:39,052] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:39,006] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:39,006] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:39,042] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:39,052] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,014] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,015] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,060] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,069] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:39,005] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_01-model_states.pt. +[default3]:[2022-10-07 12:25:39,017] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:39,054] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:39,055] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_01-model_states.pt... +[default1]:[2022-10-07 12:25:39,000] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:39,035] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:39,036] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_01-model_states.pt... +[default1]:[2022-10-07 12:25:39,073] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_01-model_states.pt. +[default1]:[2022-10-07 12:25:39,081] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,037] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,038] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,069] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,077] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:39,020] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:39,021] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_01-model_states.pt... +[default3]:[2022-10-07 12:25:39,066] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_01-model_states.pt. +[default3]:[2022-10-07 12:25:39,075] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,045] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,045] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,080] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,093] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:39,028] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:39,028] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:39,060] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:39,068] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:39,078] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_01-model_states.pt. +[default1]:[2022-10-07 12:25:39,025] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_01-model_states.pt. +[default1]:[2022-10-07 12:25:39,034] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:39,070] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,003] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,011] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,074] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,074] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:39,070] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_01-model_states.pt... +[default3]:[2022-10-07 12:25:39,032] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_01-model_states.pt. +[default3]:[2022-10-07 12:25:39,045] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:39,078] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:39,079] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_01-model_states.pt... +[default2]:[2022-10-07 12:25:39,021] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,022] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,054] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,063] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:39,004] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:39,041] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:39,042] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_01-model_states.pt... +[default1]:[2022-10-07 12:25:38,998] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:38,999] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_01-model_states.pt... +[default1]:[2022-10-07 12:25:39,062] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_01-model_states.pt. +[default1]:[2022-10-07 12:25:39,075] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:39,018] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_01-model_states.pt. +[default1]:[2022-10-07 12:25:39,065] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:39,092] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:39,094] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_01-model_states.pt... +[default3]:[2022-10-07 12:25:39,013] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_01-model_states.pt. +[default3]:[2022-10-07 12:25:39,026] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:39,087] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:39,030] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:39,093] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:39,093] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:39,088] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_01-model_states.pt... +[default2]:[2022-10-07 12:25:39,037] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,081] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,081] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,019] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,020] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,060] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,073] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:39,011] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:39,011] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_01-model_states.pt... +[default3]:[2022-10-07 12:25:39,046] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_01-model_states.pt. +[default3]:[2022-10-07 12:25:39,054] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:39,090] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:39,090] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_01-model_states.pt... +[default3]:[2022-10-07 12:25:39,064] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_01_model_states.pt. +[default3]:[2022-10-07 12:25:39,065] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:39,006] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_01-model_states.pt. +[default1]:[2022-10-07 12:25:39,014] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:39,047] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:39,048] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_01-model_states.pt... +[default0]:[2022-10-07 12:25:39,010] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:39,010] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:39,044] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:39,057] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,030] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,031] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,066] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:39,045] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_01-model_states.pt. +[default1]:[2022-10-07 12:25:39,058] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,025] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,079] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:39,051] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_01-model_states.pt. +[default1]:[2022-10-07 12:25:39,059] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:39,094] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,025] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,058] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,067] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:39,095] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_01-model_states.pt... +[default3]:[2022-10-07 12:25:39,028] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:39,028] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_01-model_states.pt... +[default3]:[2022-10-07 12:25:39,062] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_01-model_states.pt. +[default0]:[2022-10-07 12:25:39,009] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:39,022] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:39,087] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:39,071] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:39,087] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:39,095] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_01-model_states.pt. +[default3]:[2022-10-07 12:25:39,033] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_01-model_states.pt. +[default3]:[2022-10-07 12:25:39,041] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:39,075] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:39,075] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_01-model_states.pt... +[default1]:[2022-10-07 12:25:39,032] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_01-model_states.pt. +[default1]:[2022-10-07 12:25:39,044] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:39,078] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,042] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,042] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,077] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:39,079] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_01-model_states.pt... +[default2]:[2022-10-07 12:25:39,085] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,036] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,036] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,069] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,077] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:39,042] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:39,042] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:39,078] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:39,090] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:39,090] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_01-model_states.pt. +[default3]:[2022-10-07 12:25:39,102] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:39,103] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_01-model_states.pt... +[default3]:[2022-10-07 12:25:39,149] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_01-model_states.pt. +[default3]:[2022-10-07 12:25:39,162] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:39,111] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:39,119] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:39,122] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:39,122] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:39,122] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:39,137] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_01-model_states.pt. +[default1]:[2022-10-07 12:25:39,150] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,131] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,140] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,174] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,175] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:39,108] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:39,109] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_01-model_states.pt... +[default3]:[2022-10-07 12:25:39,154] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_01-model_states.pt. +[default3]:[2022-10-07 12:25:39,165] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,130] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:39,102] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:39,102] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:39,134] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:39,147] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:39,094] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:39,095] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:39,133] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:39,142] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:39,186] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:39,186] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,127] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,127] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,172] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,181] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:39,106] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_01-model_states.pt. +[default3]:[2022-10-07 12:25:39,119] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:39,156] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:39,156] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_01-model_states.pt... +[default1]:[2022-10-07 12:25:39,115] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:39,115] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_01-model_states.pt... +[default1]:[2022-10-07 12:25:39,151] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_01-model_states.pt. +[default1]:[2022-10-07 12:25:39,160] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:39,194] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:39,194] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_01-model_states.pt... +[default2]:[2022-10-07 12:25:39,121] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,121] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,153] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,161] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:39,173] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_02_model_states.pt. +[default0]:[2022-10-07 12:25:39,174] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:39,109] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:39,109] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_01-model_states.pt... +[default3]:[2022-10-07 12:25:39,168] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_01-model_states.pt. +[default3]:[2022-10-07 12:25:39,176] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,137] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,137] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,171] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,184] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:39,114] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:39,114] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:39,147] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:39,155] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:39,105] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_01-model_states.pt. +[default1]:[2022-10-07 12:25:39,113] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:39,147] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:39,147] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_01-model_states.pt... +[default1]:[2022-10-07 12:25:39,181] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_01-model_states.pt. +[default1]:[2022-10-07 12:25:39,189] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:39,133] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_01-model_states.pt. +[default3]:[2022-10-07 12:25:39,145] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:39,185] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:39,186] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_01-model_states.pt... +[default1]:[2022-10-07 12:25:39,192] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,121] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,121] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,110] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,118] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,153] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,161] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,180] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,180] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:39,103] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:39,137] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:39,138] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_01-model_states.pt... +[default1]:[2022-10-07 12:25:39,111] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:39,111] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_01-model_states.pt... +[default1]:[2022-10-07 12:25:39,154] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_01-model_states.pt. +[default1]:[2022-10-07 12:25:39,167] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:39,120] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_01_model_states.pt. +[default1]:[2022-10-07 12:25:39,120] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:39,142] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_01-model_states.pt. +[default3]:[2022-10-07 12:25:39,155] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:39,119] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:39,120] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_01-model_states.pt... +[default1]:[2022-10-07 12:25:39,177] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_01-model_states.pt. +[default1]:[2022-10-07 12:25:39,190] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,114] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,128] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,170] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,171] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:39,126] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:39,139] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:39,197] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:39,197] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:39,167] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:39,171] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_02_model_states.pt. +[default0]:[2022-10-07 12:25:39,172] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:39,130] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_01-model_states.pt. +[default2]:[2022-10-07 12:25:39,113] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,151] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,113] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,155] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,168] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:39,122] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_01-model_states.pt. +[default3]:[2022-10-07 12:25:39,130] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:39,165] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:39,165] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_01-model_states.pt... +[default3]:[2022-10-07 12:25:39,199] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_01-model_states.pt. +[default1]:[2022-10-07 12:25:39,134] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_01-model_states.pt. +[default1]:[2022-10-07 12:25:39,143] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:39,187] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:39,188] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_01-model_states.pt... +[default0]:[2022-10-07 12:25:39,106] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:39,106] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:39,140] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:39,153] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:39,105] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:39,106] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_01-model_states.pt... +[default1]:[2022-10-07 12:25:39,167] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_01-model_states.pt. +[default1]:[2022-10-07 12:25:39,180] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,102] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,102] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,135] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,112] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,112] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,153] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,143] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:39,104] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:39,104] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_01-model_states.pt... +[default3]:[2022-10-07 12:25:39,139] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_01-model_states.pt. +[default2]:[2022-10-07 12:25:39,178] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,178] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:39,148] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:39,181] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:39,181] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_01-model_states.pt... +[default2]:[2022-10-07 12:25:39,165] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:39,161] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_01-model_states.pt. +[default1]:[2022-10-07 12:25:39,169] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:39,137] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:39,150] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:39,116] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_01_model_states.pt. +[default3]:[2022-10-07 12:25:39,116] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:39,181] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_01_model_states.pt. +[default1]:[2022-10-07 12:25:39,182] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,134] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,134] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,168] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,176] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:39,110] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_01-model_states.pt. +[default3]:[2022-10-07 12:25:39,118] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:39,152] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:39,152] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_01-model_states.pt... +[default3]:[2022-10-07 12:25:39,186] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_01-model_states.pt. +[default3]:[2022-10-07 12:25:39,194] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:39,128] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_01-model_states.pt. +[default1]:[2022-10-07 12:25:39,141] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:39,181] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:39,182] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_01-model_states.pt... +[default2]:[2022-10-07 12:25:39,112] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,112] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,145] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,154] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,189] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,189] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:39,139] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:39,139] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:39,174] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:39,186] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:39,208] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:39,208] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_01-model_states.pt... +[default3]:[2022-10-07 12:25:39,252] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_01-model_states.pt. +[default3]:[2022-10-07 12:25:39,199] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:39,200] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_01-model_states.pt... +[default3]:[2022-10-07 12:25:39,259] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_01-model_states.pt. +[default3]:[2022-10-07 12:25:39,267] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,197] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,238] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,238] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,272] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:39,202] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:39,202] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:39,236] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:39,249] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:39,226] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:39,235] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:39,283] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:39,283] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,244] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,244] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,290] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:39,203] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_01-model_states.pt. +[default3]:[2022-10-07 12:25:39,216] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:39,251] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:39,252] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_01-model_states.pt... +[default1]:[2022-10-07 12:25:39,236] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_01-model_states.pt. +[default1]:[2022-10-07 12:25:39,245] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:39,280] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:39,280] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_01-model_states.pt... +[default2]:[2022-10-07 12:25:39,206] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,206] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,237] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,245] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:39,277] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_01_model_states.pt. +[default1]:[2022-10-07 12:25:39,278] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:39,217] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:39,217] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:39,252] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:39,265] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:39,212] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:39,213] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_01-model_states.pt... +[default3]:[2022-10-07 12:25:39,260] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_01-model_states.pt. +[default3]:[2022-10-07 12:25:39,268] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,217] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,217] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,252] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,264] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:39,205] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:39,205] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:39,239] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:39,247] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:39,294] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:39,294] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:39,223] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:39,223] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_01-model_states.pt... +[default1]:[2022-10-07 12:25:39,259] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_01-model_states.pt. +[default1]:[2022-10-07 12:25:39,268] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:39,239] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_01-model_states.pt. +[default3]:[2022-10-07 12:25:39,252] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:39,287] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:39,288] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_01-model_states.pt... +[default1]:[2022-10-07 12:25:39,257] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:39,258] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_01-model_states.pt... +[default2]:[2022-10-07 12:25:39,214] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,222] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,267] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,267] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,214] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,214] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,247] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,255] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:39,199] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_01-model_states.pt. +[default3]:[2022-10-07 12:25:39,212] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:39,245] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:39,245] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_01-model_states.pt... +[default1]:[2022-10-07 12:25:39,205] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:39,205] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_01-model_states.pt... +[default1]:[2022-10-07 12:25:39,258] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_01-model_states.pt. +[default1]:[2022-10-07 12:25:39,270] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,232] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default2]:[2022-10-07 12:25:39,233] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:39,233] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:39,234] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_01-model_states.pt... +[default1]:[2022-10-07 12:25:39,283] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_01-model_states.pt. +[default1]:[2022-10-07 12:25:39,295] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:39,253] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default3]:[2022-10-07 12:25:39,203] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:39,254] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:39,204] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_01-model_states.pt... +[default3]:[2022-10-07 12:25:39,252] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_01-model_states.pt. +[default3]:[2022-10-07 12:25:39,265] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,204] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,216] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,252] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,253] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,281] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,294] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:39,229] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:39,242] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:39,292] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:39,292] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:39,239] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:39,284] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:39,285] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:39,208] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:39,209] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:39,244] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:39,202] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,264] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:39,256] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:39,256] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:39,257] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_01-model_states.pt... +[default2]:[2022-10-07 12:25:39,213] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,213] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,257] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:39,260] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:39,261] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_01-model_states.pt... +[default2]:[2022-10-07 12:25:39,270] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:39,207] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:39,244] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:39,244] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_01-model_states.pt... +[default3]:[2022-10-07 12:25:39,277] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_01-model_states.pt. +[default3]:[2022-10-07 12:25:39,285] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:39,240] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_01-model_states.pt. +[default1]:[2022-10-07 12:25:39,249] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:39,294] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:39,295] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_01-model_states.pt... +[default0]:[2022-10-07 12:25:39,203] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:39,204] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:39,238] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:39,251] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:39,298] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:39,298] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:39,228] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:39,229] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_01-model_states.pt... +[default1]:[2022-10-07 12:25:39,277] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_01-model_states.pt. +[default1]:[2022-10-07 12:25:39,290] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,203] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,203] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,239] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,212] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,220] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,255] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,255] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,288] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:39,217] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_01-model_states.pt. +[default2]:[2022-10-07 12:25:39,252] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,286] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,287] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:39,218] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:39,218] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_01-model_states.pt... +[default1]:[2022-10-07 12:25:39,283] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_01-model_states.pt. +[default2]:[2022-10-07 12:25:39,296] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:39,225] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:39,257] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:39,291] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:39,257] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_01-model_states.pt... +[default3]:[2022-10-07 12:25:39,293] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_01-model_states.pt. +[default0]:[2022-10-07 12:25:39,213] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:39,213] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:39,260] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:39,273] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:39,259] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default0]:[2022-10-07 12:25:39,260] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:39,210] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:39,285] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:39,286] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_01-model_states.pt... +[default2]:[2022-10-07 12:25:39,229] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,230] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,264] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,272] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:39,229] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:39,229] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_01-model_states.pt... +[default3]:[2022-10-07 12:25:39,264] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_01-model_states.pt. +[default3]:[2022-10-07 12:25:39,272] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:39,232] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_01-model_states.pt. +[default1]:[2022-10-07 12:25:39,245] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:39,277] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:39,279] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_01-model_states.pt... +[default2]:[2022-10-07 12:25:39,224] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,232] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,267] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,267] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,303] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:39,237] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:39,237] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:39,271] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:39,284] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:39,265] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:39,195] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:39,196] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_01-model_states.pt... +[default1]:[2022-10-07 12:25:39,247] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_01-model_states.pt. +[default1]:[2022-10-07 12:25:39,260] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:39,293] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,222] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,230] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,231] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,231] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,231] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:39,296] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:39,296] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_01-model_states.pt... +[default3]:[2022-10-07 12:25:39,347] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_01-model_states.pt. +[default3]:[2022-10-07 12:25:39,360] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:39,294] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_01-model_states.pt... +[default1]:[2022-10-07 12:25:39,342] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_01-model_states.pt. +[default1]:[2022-10-07 12:25:39,355] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:39,301] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:39,302] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_01-model_states.pt... +[default3]:[2022-10-07 12:25:39,363] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_01-model_states.pt. +[default3]:[2022-10-07 12:25:39,371] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,310] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,352] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,353] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,385] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,394] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:39,300] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:39,300] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:39,334] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:39,347] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:39,323] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:39,332] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:39,373] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:39,374] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,299] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,355] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,355] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,298] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,298] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,331] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:39,300] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_01-model_states.pt. +[default3]:[2022-10-07 12:25:39,313] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:39,351] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,338] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,386] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,386] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:39,352] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_01-model_states.pt... +[default1]:[2022-10-07 12:25:39,316] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_01-model_states.pt. +[default1]:[2022-10-07 12:25:39,325] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:39,360] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:39,361] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_01-model_states.pt... +[default0]:[2022-10-07 12:25:39,303] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:39,308] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_03_model_states.pt. +[default1]:[2022-10-07 12:25:39,309] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:39,353] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:39,354] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_01-model_states.pt... +[default1]:[2022-10-07 12:25:39,395] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_01-model_states.pt. +[default3]:[2022-10-07 12:25:39,304] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:39,305] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_01-model_states.pt... +[default3]:[2022-10-07 12:25:39,354] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_01-model_states.pt. +[default3]:[2022-10-07 12:25:39,362] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,300] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,300] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,334] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:39,307] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:39,307] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:39,341] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,347] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,381] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,381] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:39,354] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:39,393] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:39,393] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:39,327] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:39,335] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:39,383] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:39,383] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:39,303] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:39,303] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_01-model_states.pt... +[default1]:[2022-10-07 12:25:39,340] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_01-model_states.pt. +[default1]:[2022-10-07 12:25:39,349] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:39,387] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:39,387] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_01-model_states.pt... +[default3]:[2022-10-07 12:25:39,337] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_01-model_states.pt. +[default3]:[2022-10-07 12:25:39,350] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:39,388] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:39,388] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_01-model_states.pt... +[default2]:[2022-10-07 12:25:39,305] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,305] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,338] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,347] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:39,312] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_01-model_states.pt. +[default1]:[2022-10-07 12:25:39,326] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:39,370] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:39,371] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_01-model_states.pt... +[default2]:[2022-10-07 12:25:39,301] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,309] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,378] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,378] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:39,298] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_01-model_states.pt. +[default3]:[2022-10-07 12:25:39,311] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:39,344] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:39,345] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_01-model_states.pt... +[default3]:[2022-10-07 12:25:39,393] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_01-model_states.pt. +[default1]:[2022-10-07 12:25:39,307] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:39,307] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_01-model_states.pt... +[default1]:[2022-10-07 12:25:39,364] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_01-model_states.pt. +[default1]:[2022-10-07 12:25:39,377] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:39,391] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:39,329] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:39,330] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_01-model_states.pt... +[default1]:[2022-10-07 12:25:39,379] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_01-model_states.pt. +[default2]:[2022-10-07 12:25:39,327] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,327] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,360] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,373] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:39,315] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:39,316] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_01-model_states.pt... +[default3]:[2022-10-07 12:25:39,361] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_01-model_states.pt. +[default3]:[2022-10-07 12:25:39,373] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:39,392] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:39,300] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_01-model_states.pt. +[default3]:[2022-10-07 12:25:39,364] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:39,327] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:39,339] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:39,387] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:39,387] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:39,322] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:39,331] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:39,393] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:39,393] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:39,306] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:39,307] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:39,341] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,307] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,307] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,344] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:39,353] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:39,394] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:39,394] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,358] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:39,305] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_01-model_states.pt. +[default2]:[2022-10-07 12:25:39,312] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,312] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:39,314] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:39,393] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:39,394] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_01-model_states.pt... +[default2]:[2022-10-07 12:25:39,352] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,366] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:39,320] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:39,320] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_01-model_states.pt... +[default3]:[2022-10-07 12:25:39,352] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_01-model_states.pt. +[default3]:[2022-10-07 12:25:39,361] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:39,353] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_01-model_states.pt. +[default1]:[2022-10-07 12:25:39,361] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:39,332] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:39,344] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:39,391] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:39,391] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:39,340] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:39,341] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_01-model_states.pt... +[default1]:[2022-10-07 12:25:39,389] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_01-model_states.pt. +[default2]:[2022-10-07 12:25:39,322] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,335] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,381] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,340] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,381] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,340] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,375] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:39,342] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:39,343] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_01-model_states.pt... +[default0]:[2022-10-07 12:25:39,336] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:39,337] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:39,383] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:39,396] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,383] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:39,301] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:39,333] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:39,333] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_01-model_states.pt... +[default3]:[2022-10-07 12:25:39,370] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_01-model_states.pt. +[default3]:[2022-10-07 12:25:39,378] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:39,330] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:39,331] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_01-model_states.pt... +[default1]:[2022-10-07 12:25:39,344] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_01-model_states.pt. +[default1]:[2022-10-07 12:25:39,358] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:39,355] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default0]:[2022-10-07 12:25:39,355] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,322] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,322] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,357] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,365] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:39,306] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:39,306] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_01-model_states.pt... +[default3]:[2022-10-07 12:25:39,342] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_01-model_states.pt. +[default3]:[2022-10-07 12:25:39,350] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:39,383] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:39,383] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_01-model_states.pt... +[default1]:[2022-10-07 12:25:39,325] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_01-model_states.pt. +[default1]:[2022-10-07 12:25:39,337] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:39,370] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:39,371] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_01-model_states.pt... +[default2]:[2022-10-07 12:25:39,311] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,346] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,346] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,380] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,388] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:39,333] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:39,333] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:39,368] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:39,381] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:39,386] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:39,387] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_01-model_states.pt... +[default3]:[2022-10-07 12:25:39,402] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:39,402] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_01-model_states.pt... +[default3]:[2022-10-07 12:25:39,443] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_01-model_states.pt. +[default3]:[2022-10-07 12:25:39,456] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:39,437] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_01-model_states.pt. +[default1]:[2022-10-07 12:25:39,450] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:39,485] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:39,485] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_01-model_states.pt... +[default3]:[2022-10-07 12:25:39,408] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:39,409] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_01-model_states.pt... +[default3]:[2022-10-07 12:25:39,460] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_01-model_states.pt. +[default3]:[2022-10-07 12:25:39,468] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,433] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,433] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,473] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,481] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:39,404] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:39,404] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:39,437] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:39,450] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:39,413] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:39,422] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:39,452] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:39,453] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:39,493] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,401] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,411] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,465] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,465] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,418] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,425] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,470] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:39,399] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_01-model_states.pt. +[default3]:[2022-10-07 12:25:39,412] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:39,459] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,471] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:39,460] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_01-model_states.pt... +[default1]:[2022-10-07 12:25:39,396] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_01-model_states.pt. +[default1]:[2022-10-07 12:25:39,405] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:39,440] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:39,440] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_01-model_states.pt... +[default1]:[2022-10-07 12:25:39,475] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_01-model_states.pt. +[default1]:[2022-10-07 12:25:39,484] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:39,414] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:39,475] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:39,475] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:39,403] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:39,453] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:39,453] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_01-model_states.pt... +[default1]:[2022-10-07 12:25:39,489] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_01-model_states.pt. +[default3]:[2022-10-07 12:25:39,397] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:39,398] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_01-model_states.pt... +[default3]:[2022-10-07 12:25:39,446] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_01-model_states.pt. +[default3]:[2022-10-07 12:25:39,455] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:39,490] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:39,491] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_01-model_states.pt... +[default2]:[2022-10-07 12:25:39,414] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,427] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,469] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,469] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:39,427] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:39,440] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:39,475] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:39,476] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:39,416] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:39,424] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:39,484] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:39,484] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:39,432] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_01-model_states.pt. +[default1]:[2022-10-07 12:25:39,441] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:39,486] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:39,486] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_01-model_states.pt... +[default3]:[2022-10-07 12:25:39,431] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_01-model_states.pt. +[default3]:[2022-10-07 12:25:39,444] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:39,482] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:39,483] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_01-model_states.pt... +[default2]:[2022-10-07 12:25:39,399] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,399] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,434] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,442] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,444] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,444] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,445] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:39,429] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_01-model_states.pt. +[default1]:[2022-10-07 12:25:39,442] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:39,486] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:39,487] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_01-model_states.pt... +[default2]:[2022-10-07 12:25:39,411] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,419] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,473] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,473] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:39,406] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:39,439] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:39,440] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_01-model_states.pt... +[default3]:[2022-10-07 12:25:39,492] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_01-model_states.pt. +[default1]:[2022-10-07 12:25:39,413] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:39,413] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_01-model_states.pt... +[default1]:[2022-10-07 12:25:39,459] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_01-model_states.pt. +[default1]:[2022-10-07 12:25:39,472] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,428] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,429] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:39,425] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:39,426] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_01-model_states.pt... +[default1]:[2022-10-07 12:25:39,473] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_01-model_states.pt. +[default1]:[2022-10-07 12:25:39,407] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_01-model_states.pt... +[default1]:[2022-10-07 12:25:39,485] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,406] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,406] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,440] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,452] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,487] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,487] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:39,423] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:39,424] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_01-model_states.pt... +[default3]:[2022-10-07 12:25:39,485] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_01-model_states.pt. +[default3]:[2022-10-07 12:25:39,498] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:39,408] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:39,408] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_01-model_states.pt... +[default3]:[2022-10-07 12:25:39,451] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_01-model_states.pt. +[default3]:[2022-10-07 12:25:39,461] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:39,420] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:39,433] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:39,479] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:39,480] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:39,432] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:39,440] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:39,492] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:39,492] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:39,427] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:39,440] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:39,476] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,401] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,401] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,437] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,450] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,491] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,491] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:39,477] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:39,451] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_01-model_states.pt. +[default3]:[2022-10-07 12:25:39,408] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_03_model_states.pt. +[default3]:[2022-10-07 12:25:39,408] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:39,455] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:39,442] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_01-model_states.pt. +[default1]:[2022-10-07 12:25:39,451] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:39,492] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:39,493] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_01-model_states.pt... +[default3]:[2022-10-07 12:25:39,455] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_01-model_states.pt... +[default3]:[2022-10-07 12:25:39,489] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_01-model_states.pt. +[default3]:[2022-10-07 12:25:39,497] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,411] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,411] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,452] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,416] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,429] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,464] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,465] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:39,403] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:39,404] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_01-model_states.pt... +[default3]:[2022-10-07 12:25:39,455] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_01-model_states.pt. +[default2]:[2022-10-07 12:25:39,464] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:39,463] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:39,402] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:39,438] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:39,439] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_01-model_states.pt... +[default1]:[2022-10-07 12:25:39,486] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_01-model_states.pt. +[default1]:[2022-10-07 12:25:39,499] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:39,407] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:39,407] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_01-model_states.pt... +[default1]:[2022-10-07 12:25:39,459] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_01-model_states.pt. +[default0]:[2022-10-07 12:25:39,424] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:39,437] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:39,488] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:39,488] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:39,468] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:39,403] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_01-model_states.pt. +[default1]:[2022-10-07 12:25:39,412] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:39,459] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:39,460] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_01-model_states.pt... +[default0]:[2022-10-07 12:25:39,444] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:39,445] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:39,493] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:39,454] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:39,456] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,435] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,435] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,471] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,479] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:39,412] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:39,412] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_01-model_states.pt... +[default3]:[2022-10-07 12:25:39,446] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_01-model_states.pt. +[default3]:[2022-10-07 12:25:39,454] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:39,487] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:39,487] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_01-model_states.pt... +[default1]:[2022-10-07 12:25:39,434] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:39,435] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_01-model_states.pt... +[default1]:[2022-10-07 12:25:39,497] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_01-model_states.pt. +[default2]:[2022-10-07 12:25:39,411] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,411] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,443] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,451] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,497] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,497] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:39,425] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_01-model_states.pt. +[default3]:[2022-10-07 12:25:39,433] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:39,422] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_01-model_states.pt. +[default1]:[2022-10-07 12:25:39,435] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:39,470] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:39,471] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_01-model_states.pt... +[default3]:[2022-10-07 12:25:39,466] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:39,467] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_01-model_states.pt... +[default2]:[2022-10-07 12:25:39,423] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,423] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,454] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,463] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,498] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,498] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:39,428] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:39,428] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:39,462] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:39,474] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:39,483] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:39,483] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:39,483] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:39,498] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:39,499] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_01-model_states.pt... +[default3]:[2022-10-07 12:25:39,539] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_01-model_states.pt. +[default3]:[2022-10-07 12:25:39,552] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:39,553] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:39,553] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_01-model_states.pt... +[default3]:[2022-10-07 12:25:39,553] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_01-model_states.pt. +[default3]:[2022-10-07 12:25:39,505] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:39,506] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_01-model_states.pt... +[default3]:[2022-10-07 12:25:39,557] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_01-model_states.pt. +[default3]:[2022-10-07 12:25:39,566] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,516] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,516] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,554] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,563] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:39,514] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:39,514] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:39,547] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:39,560] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:39,502] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:39,539] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:39,539] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:39,579] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:39,588] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,509] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,518] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,573] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,573] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,502] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,510] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,557] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,558] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,589] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:39,508] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_01-model_states.pt. +[default3]:[2022-10-07 12:25:39,521] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:39,566] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:39,567] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_01-model_states.pt... +[default1]:[2022-10-07 12:25:39,521] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:39,521] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_01-model_states.pt... +[default1]:[2022-10-07 12:25:39,556] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_01-model_states.pt. +[default1]:[2022-10-07 12:25:39,565] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:39,507] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:39,551] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:39,497] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:39,536] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:39,536] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_01-model_states.pt... +[default1]:[2022-10-07 12:25:39,567] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_01-model_states.pt. +[default1]:[2022-10-07 12:25:39,575] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:39,537] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_01-model_states.pt. +[default3]:[2022-10-07 12:25:39,545] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:39,581] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:39,582] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_01-model_states.pt... +[default2]:[2022-10-07 12:25:39,504] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,516] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,551] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,552] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,586] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:39,510] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:39,523] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:39,564] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:39,564] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:39,518] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:39,526] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:39,528] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:39,528] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:39,528] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:39,529] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_01-model_states.pt. +[default1]:[2022-10-07 12:25:39,537] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:39,573] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:39,574] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_01-model_states.pt... +[default3]:[2022-10-07 12:25:39,530] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_01-model_states.pt. +[default3]:[2022-10-07 12:25:39,542] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:39,575] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:39,575] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_01-model_states.pt... +[default2]:[2022-10-07 12:25:39,507] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:39,536] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_01-model_states.pt. +[default2]:[2022-10-07 12:25:39,515] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,563] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:39,548] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:39,589] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,563] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,597] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:39,590] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_01-model_states.pt... +[default3]:[2022-10-07 12:25:39,505] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:39,537] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:39,538] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_01-model_states.pt... +[default3]:[2022-10-07 12:25:39,583] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_01-model_states.pt. +[default3]:[2022-10-07 12:25:39,596] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:39,507] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:39,507] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_01-model_states.pt... +[default1]:[2022-10-07 12:25:39,557] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_01-model_states.pt. +[default1]:[2022-10-07 12:25:39,569] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,519] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,531] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,564] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:39,518] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:39,519] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_01-model_states.pt... +[default1]:[2022-10-07 12:25:39,564] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_01-model_states.pt. +[default2]:[2022-10-07 12:25:39,564] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,595] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:39,577] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:39,542] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:39,543] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_01-model_states.pt... +[default3]:[2022-10-07 12:25:39,593] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_01-model_states.pt. +[default0]:[2022-10-07 12:25:39,540] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:39,563] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:39,505] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:39,506] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_01-model_states.pt... +[default3]:[2022-10-07 12:25:39,541] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_01-model_states.pt. +[default3]:[2022-10-07 12:25:39,550] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:39,589] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:39,590] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_01-model_states.pt... +[default0]:[2022-10-07 12:25:39,513] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:39,525] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:39,574] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:39,574] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:39,530] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:39,538] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:39,588] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:39,588] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:39,509] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:39,522] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:39,561] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,526] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,539] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,585] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,585] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:39,561] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:39,561] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:39,595] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:39,542] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:39,542] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_01-model_states.pt... +[default3]:[2022-10-07 12:25:39,576] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_01-model_states.pt. +[default1]:[2022-10-07 12:25:39,540] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_01-model_states.pt. +[default1]:[2022-10-07 12:25:39,548] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:39,588] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:39,589] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_01-model_states.pt... +[default3]:[2022-10-07 12:25:39,584] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,517] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,517] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,555] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,511] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,524] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,559] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,559] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,594] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:39,537] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:39,537] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_01-model_states.pt... +[default1]:[2022-10-07 12:25:39,585] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_01-model_states.pt. +[default1]:[2022-10-07 12:25:39,598] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,568] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:39,512] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:39,513] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_01-model_states.pt... +[default3]:[2022-10-07 12:25:39,559] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_01-model_states.pt. +[default3]:[2022-10-07 12:25:39,568] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:39,516] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:39,516] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_01-model_states.pt... +[default1]:[2022-10-07 12:25:39,564] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_01-model_states.pt. +[default1]:[2022-10-07 12:25:39,522] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_01-model_states.pt. +[default1]:[2022-10-07 12:25:39,531] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:39,573] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:39,521] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:39,534] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:39,536] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:39,574] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:39,575] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_01-model_states.pt... +[default0]:[2022-10-07 12:25:39,506] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:39,508] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:39,508] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:39,508] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,529] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,529] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,564] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,572] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:39,521] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_01-model_states.pt. +[default3]:[2022-10-07 12:25:39,528] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:39,564] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:39,565] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_01-model_states.pt... +[default3]:[2022-10-07 12:25:39,601] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_01-model_states.pt. +[default3]:[2022-10-07 12:25:39,537] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_01-model_states.pt. +[default1]:[2022-10-07 12:25:39,511] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:39,576] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:39,577] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_01-model_states.pt... +[default1]:[2022-10-07 12:25:39,555] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:39,593] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_01-model_states.pt... +[default2]:[2022-10-07 12:25:39,530] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,538] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,586] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,586] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:39,516] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_01-model_states.pt. +[default3]:[2022-10-07 12:25:39,503] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_01-model_states.pt. +[default1]:[2022-10-07 12:25:39,528] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:39,529] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:39,530] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_01-model_states.pt... +[default1]:[2022-10-07 12:25:39,531] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_01-model_states.pt. +[default3]:[2022-10-07 12:25:39,511] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:39,543] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:39,543] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_01-model_states.pt... +[default3]:[2022-10-07 12:25:39,578] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_01-model_states.pt. +[default3]:[2022-10-07 12:25:39,586] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:39,535] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_01-model_states.pt. +[default1]:[2022-10-07 12:25:39,548] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:39,548] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:39,549] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_01-model_states.pt... +[default1]:[2022-10-07 12:25:39,550] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_01-model_states.pt. +[default0]:[2022-10-07 12:25:39,536] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:39,537] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,536] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,544] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,579] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,579] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:39,603] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:39,604] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_01-model_states.pt... +[default3]:[2022-10-07 12:25:39,656] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_01-model_states.pt. +[default3]:[2022-10-07 12:25:39,664] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,604] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,604] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,639] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,647] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,692] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,692] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:39,613] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:39,613] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:39,646] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:39,659] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:39,633] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:39,633] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:39,666] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:39,675] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:39,675] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:39,675] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,619] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:39,675] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,629] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,631] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,631] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,631] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,597] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,643] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,644] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,689] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:39,610] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:39,611] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:39,642] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:39,656] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:39,597] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:39,627] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_01-model_states.pt... +[default1]:[2022-10-07 12:25:39,608] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:39,608] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_01-model_states.pt... +[default1]:[2022-10-07 12:25:39,647] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_01-model_states.pt. +[default0]:[2022-10-07 12:25:39,600] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:39,613] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:39,645] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:39,646] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:39,680] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:39,693] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:39,677] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default0]:[2022-10-07 12:25:39,678] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:39,620] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_01-model_states.pt. +[default1]:[2022-10-07 12:25:39,629] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:39,666] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:39,666] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_01-model_states.pt... +[default3]:[2022-10-07 12:25:39,619] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_01-model_states.pt. +[default3]:[2022-10-07 12:25:39,632] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:39,664] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:39,664] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_01-model_states.pt... +[default1]:[2022-10-07 12:25:39,634] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_01-model_states.pt. +[default1]:[2022-10-07 12:25:39,647] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:39,680] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:39,681] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_01-model_states.pt... +[default2]:[2022-10-07 12:25:39,605] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,654] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,655] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,687] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,695] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:39,629] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:39,629] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_01-model_states.pt... +[default3]:[2022-10-07 12:25:39,679] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_01-model_states.pt. +[default3]:[2022-10-07 12:25:39,692] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:39,606] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:39,606] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_01-model_states.pt... +[default1]:[2022-10-07 12:25:39,658] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_01-model_states.pt. +[default1]:[2022-10-07 12:25:39,670] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,621] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,686] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:39,610] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:39,611] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_01-model_states.pt... +[default1]:[2022-10-07 12:25:39,660] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_01-model_states.pt. +[default1]:[2022-10-07 12:25:39,673] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,608] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:39,692] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_01-model_states.pt. +[default2]:[2022-10-07 12:25:39,656] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,656] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,690] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:39,605] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:39,656] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:39,657] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_01-model_states.pt... +[default3]:[2022-10-07 12:25:39,624] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_01-model_states.pt. +[default3]:[2022-10-07 12:25:39,635] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:39,668] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:39,669] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_01-model_states.pt... +[default0]:[2022-10-07 12:25:39,608] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:39,621] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:39,668] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:39,668] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:39,627] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:39,635] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:39,684] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:39,684] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,621] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,633] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,685] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,685] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:39,608] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:39,644] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:39,644] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:39,623] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:39,623] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_01-model_states.pt... +[default3]:[2022-10-07 12:25:39,663] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_01-model_states.pt. +[default0]:[2022-10-07 12:25:39,679] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:39,691] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:39,604] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:39,605] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_01-model_states.pt... +[default3]:[2022-10-07 12:25:39,638] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_01-model_states.pt. +[default3]:[2022-10-07 12:25:39,652] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:39,695] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:39,695] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_01-model_states.pt... +[default3]:[2022-10-07 12:25:39,671] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:39,638] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_01-model_states.pt. +[default1]:[2022-10-07 12:25:39,646] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:39,684] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:39,685] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_01-model_states.pt... +[default2]:[2022-10-07 12:25:39,607] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,644] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,604] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,604] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,641] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,644] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,680] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,693] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:39,634] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:39,635] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_01-model_states.pt... +[default1]:[2022-10-07 12:25:39,685] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_01-model_states.pt. +[default1]:[2022-10-07 12:25:39,698] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:39,603] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,654] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,686] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:39,604] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_01-model_states.pt... +[default3]:[2022-10-07 12:25:39,652] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_01-model_states.pt. +[default2]:[2022-10-07 12:25:39,687] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:39,660] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:39,696] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:39,697] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_01-model_states.pt... +[default1]:[2022-10-07 12:25:39,618] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:39,618] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_01-model_states.pt... +[default1]:[2022-10-07 12:25:39,658] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_01-model_states.pt. +[default1]:[2022-10-07 12:25:39,667] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:39,652] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,623] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,623] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,660] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,667] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:39,608] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:39,643] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:39,643] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_01-model_states.pt... +[default3]:[2022-10-07 12:25:39,683] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_01-model_states.pt. +[default3]:[2022-10-07 12:25:39,691] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:39,603] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:39,652] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:39,653] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_01-model_states.pt... +[default3]:[2022-10-07 12:25:39,691] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_01-model_states.pt. +[default1]:[2022-10-07 12:25:39,627] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_01-model_states.pt. +[default1]:[2022-10-07 12:25:39,640] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:39,696] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:39,697] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_01-model_states.pt... +[default3]:[2022-10-07 12:25:39,654] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_01_model_states.pt. +[default3]:[2022-10-07 12:25:39,655] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,620] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,628] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,683] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,683] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:39,618] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:39,619] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_01-model_states.pt... +[default3]:[2022-10-07 12:25:39,653] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_01-model_states.pt. +[default3]:[2022-10-07 12:25:39,661] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:39,694] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:39,694] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_01-model_states.pt... +[default2]:[2022-10-07 12:25:39,610] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,618] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,652] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,653] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,684] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,692] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,693] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,693] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,693] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:39,619] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_01-model_states.pt. +[default3]:[2022-10-07 12:25:39,632] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:39,632] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:39,633] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_01-model_states.pt... +[default3]:[2022-10-07 12:25:39,634] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_01-model_states.pt. +[default1]:[2022-10-07 12:25:39,602] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:39,602] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_01-model_states.pt... +[default1]:[2022-10-07 12:25:39,637] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_01-model_states.pt. +[default1]:[2022-10-07 12:25:39,645] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:39,678] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:39,678] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_01-model_states.pt... +[default1]:[2022-10-07 12:25:39,655] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:39,685] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:39,685] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_01-model_states.pt... +[default3]:[2022-10-07 12:25:39,627] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_01-model_states.pt. +[default3]:[2022-10-07 12:25:39,641] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:39,695] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:39,695] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_01-model_states.pt... +[default2]:[2022-10-07 12:25:39,598] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,632] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,632] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,665] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,677] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:39,636] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_01-model_states.pt. +[default1]:[2022-10-07 12:25:39,644] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:39,686] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:39,686] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_01-model_states.pt... +[default3]:[2022-10-07 12:25:39,701] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:39,702] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_01-model_states.pt... +[default3]:[2022-10-07 12:25:39,755] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_01-model_states.pt. +[default3]:[2022-10-07 12:25:39,762] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,726] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,735] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,776] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:39,713] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:39,713] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:39,746] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,776] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,697] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,743] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,743] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,777] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:39,710] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_01-model_states.pt. +[default1]:[2022-10-07 12:25:39,718] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:39,719] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:39,704] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:39,704] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:39,737] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:39,750] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:39,783] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:39,784] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:39,719] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_01-model_states.pt... +[default1]:[2022-10-07 12:25:39,719] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_01-model_states.pt. +[default1]:[2022-10-07 12:25:39,718] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_01-model_states.pt. +[default1]:[2022-10-07 12:25:39,726] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:39,758] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:39,758] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_01-model_states.pt... +[default1]:[2022-10-07 12:25:39,790] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_01-model_states.pt. +[default3]:[2022-10-07 12:25:39,743] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_01-model_states.pt. +[default3]:[2022-10-07 12:25:39,751] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:39,752] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:39,753] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_01-model_states.pt... +[default3]:[2022-10-07 12:25:39,754] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_01-model_states.pt. +[default2]:[2022-10-07 12:25:39,711] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,711] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,746] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,759] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,789] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,789] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:39,726] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:39,727] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:39,756] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:39,769] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:39,716] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_01-model_states.pt. +[default1]:[2022-10-07 12:25:39,725] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:39,725] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:39,726] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_01-model_states.pt... +[default1]:[2022-10-07 12:25:39,727] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_01-model_states.pt. +[default3]:[2022-10-07 12:25:39,714] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_01-model_states.pt. +[default3]:[2022-10-07 12:25:39,727] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:39,727] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:39,727] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_01-model_states.pt... +[default3]:[2022-10-07 12:25:39,727] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_01-model_states.pt. +[default1]:[2022-10-07 12:25:39,728] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_01-model_states.pt. +[default1]:[2022-10-07 12:25:39,741] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:39,779] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:39,780] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_01-model_states.pt... +[default2]:[2022-10-07 12:25:39,741] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,741] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,773] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,781] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:39,724] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:39,725] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_01-model_states.pt... +[default3]:[2022-10-07 12:25:39,773] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_01-model_states.pt. +[default3]:[2022-10-07 12:25:39,786] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:39,707] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:39,707] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_01-model_states.pt... +[default1]:[2022-10-07 12:25:39,754] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_01-model_states.pt. +[default1]:[2022-10-07 12:25:39,767] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,703] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,750] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,750] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,726] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,727] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,759] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,783] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,795] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:39,754] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:39,798] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:39,798] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_01-model_states.pt... +[default2]:[2022-10-07 12:25:39,768] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:39,707] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:39,708] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_01-model_states.pt... +[default1]:[2022-10-07 12:25:39,756] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_01-model_states.pt. +[default1]:[2022-10-07 12:25:39,769] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:39,702] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_01-model_states.pt. +[default3]:[2022-10-07 12:25:39,715] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:39,750] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:39,751] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_01-model_states.pt... +[default3]:[2022-10-07 12:25:39,798] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_01-model_states.pt. +[default3]:[2022-10-07 12:25:39,701] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_01-model_states.pt. +[default3]:[2022-10-07 12:25:39,710] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:39,753] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:39,753] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_01-model_states.pt... +[default3]:[2022-10-07 12:25:39,789] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_01-model_states.pt. +[default3]:[2022-10-07 12:25:39,797] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:39,703] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:39,716] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:39,763] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:39,763] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:39,797] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:39,721] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:39,729] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:39,785] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:39,785] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,722] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,735] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,786] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,786] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:39,726] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:39,727] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:39,762] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:39,774] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:39,729] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_01-model_states.pt. +[default3]:[2022-10-07 12:25:39,742] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:39,782] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:39,706] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:39,706] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_01-model_states.pt... +[default3]:[2022-10-07 12:25:39,746] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_01-model_states.pt. +[default3]:[2022-10-07 12:25:39,782] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_01-model_states.pt... +[default1]:[2022-10-07 12:25:39,732] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_01-model_states.pt. +[default1]:[2022-10-07 12:25:39,739] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:39,785] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:39,786] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_01-model_states.pt... +[default3]:[2022-10-07 12:25:39,754] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:39,788] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:39,788] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_01-model_states.pt... +[default2]:[2022-10-07 12:25:39,730] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,730] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,763] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,776] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,777] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,777] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,777] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:39,736] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:39,736] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_01-model_states.pt... +[default1]:[2022-10-07 12:25:39,778] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_01-model_states.pt. +[default1]:[2022-10-07 12:25:39,791] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:39,792] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:39,792] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_01-model_states.pt... +[default1]:[2022-10-07 12:25:39,792] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_01-model_states.pt. +[default2]:[2022-10-07 12:25:39,723] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,736] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,765] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:39,718] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:39,762] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:39,762] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:39,796] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,766] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:39,741] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_01-model_states.pt. +[default3]:[2022-10-07 12:25:39,750] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:39,790] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:39,790] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_01-model_states.pt... +[default1]:[2022-10-07 12:25:39,712] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:39,712] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_01-model_states.pt... +[default1]:[2022-10-07 12:25:39,753] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_01-model_states.pt. +[default1]:[2022-10-07 12:25:39,761] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:39,725] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:39,726] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_01-model_states.pt... +[default2]:[2022-10-07 12:25:39,717] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,717] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,752] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:39,773] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_01-model_states.pt. +[default3]:[2022-10-07 12:25:39,702] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:39,747] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:39,748] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_01-model_states.pt... +[default3]:[2022-10-07 12:25:39,791] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_01-model_states.pt. +[default3]:[2022-10-07 12:25:39,800] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:39,781] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:39,781] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,760] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,762] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,762] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,762] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:39,782] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_01-model_states.pt... +[default3]:[2022-10-07 12:25:39,783] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_01-model_states.pt. +[default1]:[2022-10-07 12:25:39,752] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_01-model_states.pt. +[default1]:[2022-10-07 12:25:39,765] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:39,775] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,716] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,724] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,773] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,773] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:39,726] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_01-model_states.pt. +[default3]:[2022-10-07 12:25:39,734] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:39,734] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:39,735] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_01-model_states.pt... +[default3]:[2022-10-07 12:25:39,735] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_01-model_states.pt. +[default1]:[2022-10-07 12:25:39,738] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_01-model_states.pt. +[default1]:[2022-10-07 12:25:39,747] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:39,791] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:39,791] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_01-model_states.pt... +[default3]:[2022-10-07 12:25:39,800] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:39,809] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_01-model_states.pt... +[default3]:[2022-10-07 12:25:39,856] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_01-model_states.pt. +[default0]:[2022-10-07 12:25:39,804] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_00_optim_states.pt'] +[default0]:[2022-10-07 12:25:39,805] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt'] +[default2]:[2022-10-07 12:25:39,811] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,819] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,859] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,859] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,891] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,833] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_00_optim_states.pt'] +[default2]:[2022-10-07 12:25:39,833] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt'] +[default0]:[2022-10-07 12:25:39,815] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:39,828] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:39,862] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:39,863] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:39,895] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:39,798] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:39,834] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:39,835] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_01-model_states.pt... +[default1]:[2022-10-07 12:25:39,866] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_01-model_states.pt. +[default1]:[2022-10-07 12:25:39,874] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,819] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,832] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,862] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,863] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,892] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:39,798] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:39,798] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:39,827] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:39,823] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_01-model_states.pt. +[default1]:[2022-10-07 12:25:39,836] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:39,874] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:39,875] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_01-model_states.pt... +[default2]:[2022-10-07 12:25:39,830] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,830] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,862] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,869] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:39,820] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:39,821] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_01-model_states.pt... +[default3]:[2022-10-07 12:25:39,865] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_01-model_states.pt. +[default2]:[2022-10-07 12:25:39,842] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,842] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,874] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,886] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:39,832] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_01-model_states.pt. +[default1]:[2022-10-07 12:25:39,861] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,811] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,811] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,845] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:39,865] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,853] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,890] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:39,803] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,890] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:39,804] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_01-model_states.pt... +[default1]:[2022-10-07 12:25:39,849] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_01-model_states.pt. +[default1]:[2022-10-07 12:25:39,862] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:39,896] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:39,896] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_01-model_states.pt... +[default3]:[2022-10-07 12:25:39,811] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:39,845] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:39,846] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_01-model_states.pt... +[default3]:[2022-10-07 12:25:39,894] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_01-model_states.pt. +[default3]:[2022-10-07 12:25:39,833] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:39,833] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_01-model_states.pt... +[default3]:[2022-10-07 12:25:39,870] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_01-model_states.pt. +[default3]:[2022-10-07 12:25:39,878] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:39,810] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:39,857] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:39,857] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:39,891] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:39,822] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:39,830] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:39,879] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:39,879] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,823] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,836] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,879] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,879] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:39,808] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:39,809] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:39,839] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:39,852] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:39,882] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:39,882] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:39,818] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_01-model_states.pt. +[default3]:[2022-10-07 12:25:39,832] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:39,828] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_01-model_states.pt. +[default3]:[2022-10-07 12:25:39,836] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:39,870] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:39,868] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:39,868] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_01-model_states.pt... +[default1]:[2022-10-07 12:25:39,833] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_01-model_states.pt. +[default1]:[2022-10-07 12:25:39,841] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:39,879] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:39,880] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_01-model_states.pt... +[default3]:[2022-10-07 12:25:39,870] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_01-model_states.pt... +[default2]:[2022-10-07 12:25:39,814] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,827] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,827] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:39,805] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:39,844] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_01-model_states.pt. +[default1]:[2022-10-07 12:25:39,853] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:39,886] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:39,887] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_01-model_states.pt... +[default0]:[2022-10-07 12:25:39,847] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:39,848] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:39,882] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,828] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,828] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:39,839] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_01-model_states.pt. +[default0]:[2022-10-07 12:25:39,891] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:39,848] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:39,849] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:39,849] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_01-model_states.pt... +[default3]:[2022-10-07 12:25:39,851] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_01-model_states.pt. +[default3]:[2022-10-07 12:25:39,849] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:39,849] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_01-model_states.pt... +[default3]:[2022-10-07 12:25:39,892] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_01-model_states.pt. +[default3]:[2022-10-07 12:25:39,900] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:39,805] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_02_optim_states.pt'] +[default1]:[2022-10-07 12:25:39,807] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:39,807] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_01-model_states.pt... +[default1]:[2022-10-07 12:25:39,847] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_01-model_states.pt. +[default2]:[2022-10-07 12:25:39,860] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/globa[default0]:[2022-10-07 12:25:39,806] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_02_optim_states.pt'] +[default1]:[2022-10-07 12:25:39,855] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +l_step0/zero_pp_rank_5_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_02_optim_states.pt'] +[default2]:[2022-10-07 12:25:39,861] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_02_optim_states.pt'] +[default1]:[2022-10-07 12:25:39,810] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:39,811] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_01-model_states.pt... +[default1]:[2022-10-07 12:25:39,866] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_01-model_states.pt. +[default1]:[2022-10-07 12:25:39,879] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:39,880] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:39,881] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_01-model_states.pt... +[default0]:[2022-10-07 12:25:39,810] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,806] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,859] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_00_optim_states.pt'] +[default2]:[2022-10-07 12:25:39,860] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt'] +[default2]:[2022-10-07 12:25:39,833] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_02_optim_states.pt'] +[default2]:[2022-10-07 12:25:39,834] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_02_optim_states.pt'] +[default1]:[2022-10-07 12:25:39,803] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:39,809] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_01-model_states.pt... +[default1]:[2022-10-07 12:25:39,854] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_01-model_states.pt. +[default0]:[2022-10-07 12:25:39,840] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:39,878] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:39,879] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:39,855] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:39,856] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_01-model_states.pt... +[default1]:[2022-10-07 12:25:39,856] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_01-model_states.pt. +[default3]:[2022-10-07 12:25:39,913] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_01_optim_states.pt'] +[default3]:[2022-10-07 12:25:39,913] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt'] +[default2]:[2022-10-07 12:25:39,900] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,937] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,938] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,969] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,977] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:39,908] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:39,942] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:39,942] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:39,974] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:39,987] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:39,958] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_01-model_states.pt. +[default3]:[2022-10-07 12:25:39,904] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_01_model_states.pt. +[default3]:[2022-10-07 12:25:39,905] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:39,905] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:39,905] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_01-model_states.pt... +[default1]:[2022-10-07 12:25:39,936] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_01-model_states.pt. +[default1]:[2022-10-07 12:25:39,944] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:39,976] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:39,976] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_01-model_states.pt... +[default2]:[2022-10-07 12:25:39,905] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,905] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,905] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:39,909] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:39,921] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:39,960] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:39,960] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:39,992] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:39,919] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_01-model_states.pt. +[default1]:[2022-10-07 12:25:39,932] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:39,966] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:39,967] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_01-model_states.pt... +[default2]:[2022-10-07 12:25:39,915] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,915] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,947] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:39,924] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_01_optim_states.pt'] +[default3]:[2022-10-07 12:25:39,924] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt'] +[default2]:[2022-10-07 12:25:39,931] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,931] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,963] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,976] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:39,903] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:39,904] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_01-model_states.pt... +[default1]:[2022-10-07 12:25:39,935] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_01-model_states.pt. +[default1]:[2022-10-07 12:25:39,943] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:39,982] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:39,983] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_01-model_states.pt... +[default0]:[2022-10-07 12:25:39,928] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:39,970] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:39,970] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:39,907] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:39,941] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:39,942] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_01-model_states.pt... +[default2]:[2022-10-07 12:25:39,923] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:39,942] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_01-model_states.pt. +[default2]:[2022-10-07 12:25:39,931] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:39,990] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_01-model_states.pt. +[default2]:[2022-10-07 12:25:39,964] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,964] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:39,955] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:39,989] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,997] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:39,990] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_01-model_states.pt... +[default3]:[2022-10-07 12:25:39,920] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:39,920] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_01-model_states.pt... +[default3]:[2022-10-07 12:25:39,963] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_01-model_states.pt. +[default3]:[2022-10-07 12:25:39,971] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:39,903] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:39,952] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:39,952] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:39,987] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:39,919] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:39,927] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:39,975] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:39,975] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,914] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,927] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:39,976] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:39,976] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:39,912] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:39,925] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:39,954] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:39,954] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:39,984] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:39,996] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:39,903] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_01-model_states.pt. +[default3]:[2022-10-07 12:25:39,915] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:39,949] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:39,949] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_01-model_states.pt... +[default3]:[2022-10-07 12:25:39,984] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_01-model_states.pt. +[default3]:[2022-10-07 12:25:39,997] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:39,904] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_01-model_states.pt. +[default3]:[2022-10-07 12:25:39,912] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:39,946] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:39,929] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_01-model_states.pt. +[default1]:[2022-10-07 12:25:39,937] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:39,975] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:39,976] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_01-model_states.pt... +[default3]:[2022-10-07 12:25:39,946] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_01-model_states.pt... +[default3]:[2022-10-07 12:25:39,980] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_01-model_states.pt. +[default3]:[2022-10-07 12:25:39,990] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:39,939] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_01-model_states.pt. +[default1]:[2022-10-07 12:25:39,947] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:39,948] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:39,928] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:39,949] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_01-model_states.pt... +[default1]:[2022-10-07 12:25:39,950] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_01-model_states.pt. +[default0]:[2022-10-07 12:25:39,928] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:39,962] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:39,970] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:39,950] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:39,951] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_01-model_states.pt... +[default3]:[2022-10-07 12:25:39,990] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_01-model_states.pt. +[default3]:[2022-10-07 12:25:39,999] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:39,913] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoint[default1]:[2022-10-07 12:25:39,930] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:39,930] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_01-model_states.pt... +[default1]:[2022-10-07 12:25:39,984] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_01-model_states.pt. +[default1]:[2022-10-07 12:25:39,997] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:39,943] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_01-model_states.pt. +[default3]:[2022-10-07 12:25:39,924] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_03_optim_states.pt'] +[default3]:[2022-10-07 12:25:39,925] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_03_optim_states.pt'] +[default1]:[2022-10-07 12:25:39,910] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_01_optim_states.pt'] +[default1]:[2022-10-07 12:25:39,910] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt'] +[default2]:[2022-10-07 12:25:39,906] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +s/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_03_optim_states.pt'] +[default3]:[2022-10-07 12:25:39,914] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_03_optim_states.pt'] +[default1]:[2022-10-07 12:25:39,911] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_03_optim_states.pt'] +[default1]:[2022-10-07 12:25:39,911] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_03_optim_states.pt'] +[default2]:[2022-10-07 12:25:40,014] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:40,014] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:40,046] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:40,054] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:40,085] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:40,086] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:40,025] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:40,025] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:40,056] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:40,068] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:40,069] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:40,007] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_01-model_states.pt. +[default1]:[2022-10-07 12:25:40,014] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:40,045] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:40,046] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_01-model_states.pt... +[default1]:[2022-10-07 12:25:40,077] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_01-model_states.pt. +[default1]:[2022-10-07 12:25:40,084] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:40,004] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:40,037] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:40,037] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:40,043] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:40,076] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:40,011] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_01-model_states.pt. +[default1]:[2022-10-07 12:25:40,024] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:40,056] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:40,057] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_01-model_states.pt... +[default2]:[2022-10-07 12:25:40,010] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_00_optim_states.pt'] +[default2]:[2022-10-07 12:25:40,010] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt'] +[default2]:[2022-10-07 12:25:40,021] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:40,021] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:40,053] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:40,023] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_01-model_states.pt. +[default1]:[2022-10-07 12:25:40,031] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:40,065] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:40,065] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_01-model_states.pt... +[default1]:[2022-10-07 12:25:40,099] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_01-model_states.pt. +[default0]:[2022-10-07 12:25:40,004] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:40,039] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:40,082] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:40,082] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:40,003] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:40,037] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:40,038] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_01-model_states.pt... +[default3]:[2022-10-07 12:25:40,086] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_01-model_states.pt. +[default2]:[2022-10-07 12:25:40,005] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:40,036] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_01-model_states.pt. +[default2]:[2022-10-07 12:25:40,047] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:40,047] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:40,049] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:40,082] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:40,079] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:40,087] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:40,083] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_01-model_states.pt... +[default3]:[2022-10-07 12:25:40,007] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:40,007] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_01-model_states.pt... +[default3]:[2022-10-07 12:25:40,050] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_01-model_states.pt. +[default3]:[2022-10-07 12:25:40,059] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:40,000] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:40,047] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:40,047] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:40,081] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:40,013] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:40,020] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:40,072] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:40,072] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:40,012] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:40,024] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:40,072] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:40,072] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:40,027] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:40,027] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:40,056] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:40,069] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:40,039] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:40,039] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_01-model_states.pt... +[default3]:[2022-10-07 12:25:40,074] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_01-model_states.pt. +[default3]:[2022-10-07 12:25:40,087] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:40,033] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:40,034] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_01-model_states.pt... +[default3]:[2022-10-07 12:25:40,064] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_01-model_states.pt. +[default1]:[2022-10-07 12:25:40,030] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_01-model_states.pt. +[default1]:[2022-10-07 12:25:40,037] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:40,073] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:40,074] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_01-model_states.pt... +[default3]:[2022-10-07 12:25:40,071] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:40,011] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_02_optim_states.pt'] +[default2]:[2022-10-07 12:25:40,011] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_02_optim_states.pt'] +[default2]:[2022-10-07 12:25:40,016] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default2]:[2022-10-07 12:25:40,016] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:40,004] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:40,004] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:40,038] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:40,047] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:40,081] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:40,081] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:40,042] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:40,043] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_01-model_states.pt... +[default3]:[2022-10-07 12:25:40,090] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_01-model_states.pt. +[default3]:[2022-10-07 12:25:40,099] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:40,057] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:40,058] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_01-model_states.pt... +[default3]:[2022-10-07 12:25:40,076] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_01-model_states.pt. +[default1]:[2022-10-07 12:25:40,054] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:40,066] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:40,078] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:40,116] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:40,125] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:40,163] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:40,163] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:40,193] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:40,103] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:40,103] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:40,135] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:40,148] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:40,183] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:40,183] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:40,128] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:40,128] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_01-model_states.pt... +[default1]:[2022-10-07 12:25:40,169] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_01-model_states.pt. +[default0]:[2022-10-07 12:25:40,109] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:40,109] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:40,138] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:40,115] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:40,115] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_01-model_states.pt... +[default1]:[2022-10-07 12:25:40,147] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_01-model_states.pt. +[default1]:[2022-10-07 12:25:40,155] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:40,186] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:40,186] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_01-model_states.pt... +[default0]:[2022-10-07 12:25:40,152] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:40,152] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:40,153] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:40,153] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:40,158] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_03_optim_states.pt'] +[default3]:[2022-10-07 12:25:40,158] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_03_optim_states.pt'] +[default2]:[2022-10-07 12:25:40,132] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_02_optim_states.pt'] +[default2]:[2022-10-07 12:25:40,132] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_02_optim_states.pt'] +[default1]:[2022-10-07 12:25:40,100] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_01-model_states.pt. +[default1]:[2022-10-07 12:25:40,112] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:40,145] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:40,146] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_01-model_states.pt... +[default1]:[2022-10-07 12:25:40,193] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_01-model_states.pt. +[default2]:[2022-10-07 12:25:40,130] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_00_optim_states.pt'] +[default2]:[2022-10-07 12:25:40,131] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmi[default2]:[2022-10-07 12:25:40,120] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:40,126] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_01-model_states.pt. +xnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt'] +[default1]:[2022-10-07 12:25:40,107] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:40,142] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:40,142] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_01-model_states.pt... +[default1]:[2022-10-07 12:25:40,177] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_01-model_states.pt. +[default2]:[2022-10-07 12:25:40,120] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:40,153] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:40,161] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:40,184] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:40,116] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:40,124] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:40,200] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:40,200] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:40,164] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:40,164] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:40,197] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:40,157] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_01_optim_states.pt'] +[default3]:[2022-10-07 12:25:40,158] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt'] +[default3]:[2022-10-07 12:25:40,103] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:40,103] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_01-model_states.pt... +[default3]:[2022-10-07 12:25:40,140] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_01-model_states.pt. +[default3]:[2022-10-07 12:25:40,148] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:40,186] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:40,186] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_01-model_states.pt... +[default0]:[2022-10-07 12:25:40,108] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:40,116] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:40,164] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:40,165] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:40,107] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:40,107] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:40,137] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:40,107] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:40,120] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:40,165] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:40,165] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:40,201] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:40,149] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:40,150] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:40,150] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:40,150] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:40,122] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:40,122] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_01-model_states.pt... +[default3]:[2022-10-07 12:25:40,163] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_01-model_states.pt. +[default3]:[2022-10-07 12:25:40,176] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:40,121] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_01-model_states.pt. +[default1]:[2022-10-07 12:25:40,129] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:40,164] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:40,165] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_01-model_states.pt... +[default3]:[2022-10-07 12:25:40,102] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:40,102] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_01-model_states.pt... +[default3]:[2022-10-07 12:25:40,133] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_01-model_states.pt. +[default3]:[2022-10-07 12:25:40,140] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:40,170] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:40,170] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_01-model_states.pt... +[default3]:[2022-10-07 12:25:40,200] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_01-model_states.pt. +[default0]:[2022-10-07 12:25:40,113] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:40,122] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:40,155] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:40,155] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:40,187] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:40,195] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:40,141] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:40,141] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_01-model_states.pt... +[default3]:[2022-10-07 12:25:40,186] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_01-model_states.pt. +[default3]:[2022-10-07 12:25:40,195] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:40,106] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_01-model_states.pt. +[default1]:[2022-10-07 12:25:40,119] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:40,170] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:40,171] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_01-model_states.pt... +[default1]:[2022-10-07 12:25:40,106] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:40,107] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_01-model_states.pt... +[default1]:[2022-10-07 12:25:40,150] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_01-model_states.pt. +[default3]:[2022-10-07 12:25:40,141] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:40,197] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:40,180] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:40,180] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_01-model_states.pt... +[default0]:[2022-10-07 12:25:40,161] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:40,247] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_00_optim_states.pt'] +[default2]:[2022-10-07 12:25:40,247] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt'] +[default0]:[2022-10-07 12:25:40,215] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:40,228] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:40,263] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:40,263] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:40,297] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:40,208] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:40,275] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:40,253] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:40,253] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_01-model_states.pt... +[default1]:[2022-10-07 12:25:40,287] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_01-model_states.pt. +[default1]:[2022-10-07 12:25:40,217] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_01-model_states.pt. +[default1]:[2022-10-07 12:25:40,224] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:40,225] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:40,225] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_01-model_states.pt... +[default1]:[2022-10-07 12:25:40,225] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_01-model_states.pt. +[default0]:[2022-10-07 12:25:40,235] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoint[default1]:[2022-10-07 12:25:40,205] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:40,238] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:40,239] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_01-model_states.pt... +[default1]:[2022-10-07 12:25:40,283] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_01-model_states.pt. +[default1]:[2022-10-07 12:25:40,295] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:40,218] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:40,219] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_01-model_states.pt... +[default1]:[2022-10-07 12:25:40,254] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_01-model_states.pt. +[default1]:[2022-10-07 12:25:40,262] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:40,296] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:40,296] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_01-model_states.pt... +[default0]:[2022-10-07 12:25:40,205] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:40,239] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:40,239] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:40,271] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:40,279] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:40,232] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:40,240] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:40,271] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:40,271] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:40,227] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_01_optim_states.pt'] +[default1]:[2022-10-07 12:25:40,228] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt'] +[default3]:[2022-10-07 12:25:40,222] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_01-model_states.pt. +[default0]:[2022-10-07 12:25:40,202] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:40,210] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:40,258] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:40,258] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:40,298] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:40,214] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:40,258] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:40,258] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:40,296] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:40,212] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:40,212] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_01-model_states.pt... +[default3]:[2022-10-07 12:25:40,246] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_01-model_states.pt. +[default3]:[2022-10-07 12:25:40,259] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:40,295] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:40,295] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_01-model_states.pt... +[default3]:[2022-10-07 12:25:40,208] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:40,237] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:40,237] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_01-model_states.pt... +[default3]:[2022-10-07 12:25:40,267] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_01-model_states.pt. +[default3]:[2022-10-07 12:25:40,275] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:40,247] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_02_optim_states.pt'] +[default2]:[2022-10-07 12:25:40,247] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_02_optim_states.pt'] +[default0]:[2022-10-07 12:25:40,228] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:40,228] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:40,261] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:40,270] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:40,237] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:40,238] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_01-model_states.pt... +[default3]:[2022-10-07 12:25:40,281] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_01-model_states.pt. +[default3]:[2022-10-07 12:25:40,289] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:40,221] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_01-model_states.pt. +[default1]:[2022-10-07 12:25:40,234] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:40,287] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:40,288] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_01-model_states.pt... +[default1]:[2022-10-07 12:25:40,256] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:40,257] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_01-model_states.pt... +[default1]:[2022-10-07 12:25:40,302] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_01-model_states.pt. +[default3]:[2022-10-07 12:25:40,211] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_01-model_states.pt. +[default3]:[2022-10-07 12:25:40,220] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:40,259] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:40,259] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_01-model_states.pt... +[default3]:[2022-10-07 12:25:40,300] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_01-model_states.pt. +[default0]:[2022-10-07 12:25:40,270] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +s/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_02_optim_states.pt'] +[default0]:[2022-10-07 12:25:40,236] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_02_optim_states.pt'] +[default1]:[2022-10-07 12:25:40,229] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_03_optim_states.pt'] +[default1]:[2022-10-07 12:25:40,229] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_03_optim_states.pt'] +[default1]:[2022-10-07 12:25:40,212] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_01-model_states.pt. +[default1]:[2022-10-07 12:25:40,220] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:40,235] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/globa[default1]:[2022-10-07 12:25:40,258] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:40,259] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_01-model_states.pt... +l_step0/zero_pp_rank_5_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_00_optim_states.pt'] +[default3]:[2022-10-07 12:25:40,231] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:40,271] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:40,236] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt'] +[default3]:[2022-10-07 12:25:40,271] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_01-model_states.pt... +[default0]:[2022-10-07 12:25:40,309] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:40,345] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:40,345] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:40,378] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:40,391] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:40,299] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:40,310] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_01-model_states.pt... +[default1]:[2022-10-07 12:25:40,334] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:40,334] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_01-model_states.pt... +[default1]:[2022-10-07 12:25:40,369] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_01-model_states.pt. +[default1]:[2022-10-07 12:25:40,382] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:40,327] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:40,328] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_01-model_states.pt... +[default1]:[2022-10-07 12:25:40,380] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_01-model_states.pt. +[default1]:[2022-10-07 12:25:40,331] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_01-model_states.pt. +[default1]:[2022-10-07 12:25:40,339] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:40,379] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:40,379] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_01-model_states.pt... +[default0]:[2022-10-07 12:25:40,313] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:40,313] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:40,346] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:40,354] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:40,386] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:40,386] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:40,304] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:40,313] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:40,344] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:40,344] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:40,377] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:40,385] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:40,307] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_01-model_states.pt. +[default3]:[2022-10-07 12:25:40,316] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:40,352] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:40,352] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_01-model_states.pt... +[default3]:[2022-10-07 12:25:40,388] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_01-model_states.pt. +[default0]:[2022-10-07 12:25:40,306] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:40,359] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:40,360] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:40,396] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:40,309] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:40,361] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:40,361] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:40,398] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:40,327] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_01-model_states.pt. +[default3]:[2022-10-07 12:25:40,340] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:40,304] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:40,304] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_01-model_states.pt... +[default3]:[2022-10-07 12:25:40,334] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_01-model_states.pt. +[default3]:[2022-10-07 12:25:40,378] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:40,378] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_01-model_states.pt... +[default1]:[2022-10-07 12:25:40,308] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_01-model_states.pt. +[default1]:[2022-10-07 12:25:40,316] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:40,359] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:40,360] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_01-model_states.pt... +[default3]:[2022-10-07 12:25:40,342] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:40,342] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:40,342] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_01-model_states.pt... +[default3]:[2022-10-07 12:25:40,343] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_01-model_states.pt. +[default2]:[2022-10-07 12:25:40,361] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:40,393] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:40,304] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:40,304] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:40,338] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:40,346] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:40,381] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:40,381] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:40,333] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:40,334] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_01-model_states.pt... +[default3]:[2022-10-07 12:25:40,376] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_01-model_states.pt. +[default3]:[2022-10-07 12:25:40,385] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:40,336] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_01-model_states.pt. +[default1]:[2022-10-07 12:25:40,349] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:40,315] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:40,361] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:40,362] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_01-model_states.pt... +[default3]:[2022-10-07 12:25:40,308] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:40,349] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:40,350] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_01-model_states.pt... +[default0]:[2022-10-07 12:25:40,325] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:40,325] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:40,357] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:40,402] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:40,451] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_02_optim_states.pt'] +[default0]:[2022-10-07 12:25:40,452] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_02_optim_states.pt'] +[default2]:[2022-10-07 12:25:40,459] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_02_optim_states.pt'] +[default2]:[2022-10-07 12:25:40,459] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_02_optim_states.pt'] +[default1]:[2022-10-07 12:25:40,494] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_03_optim_states.pt'] +[default1]:[2022-10-07 12:25:40,495] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_03_optim_states.pt'] +[default0]:[2022-10-07 12:25:40,425] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:40,426] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:40,457] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:40,413] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:40,413] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_01-model_states.pt... +[default1]:[2022-10-07 12:25:40,456] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_01-model_states.pt. +[default1]:[2022-10-07 12:25:40,469] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:40,453] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:40,443] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_03_optim_states.pt'] +[default3]:[2022-10-07 12:25:40,443] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_03_optim_states.pt'] +[default1]:[2022-10-07 12:25:40,439] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_01_optim_states.pt'] +[default1]:[2022-10-07 12:25:40,440] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt'] +[default1]:[2022-10-07 12:25:40,414] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_01-model_states.pt. +[default1]:[2022-10-07 12:25:40,423] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:40,458] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:40,458] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_01-model_states.pt... +[default1]:[2022-10-07 12:25:40,493] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_01-model_states.pt. +[default0]:[2022-10-07 12:25:40,419] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:40,427] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:40,461] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:40,461] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:40,495] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:40,417] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:40,417] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:40,449] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:40,457] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:40,485] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:40,485] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:40,442] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_01_optim_states.pt'] +[default3]:[2022-10-07 12:25:40,442] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt'] +[default0]:[2022-10-07 12:25:40,451] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_00_optim_states.pt'] +[default0]:[2022-10-07 12:25:40,451] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt'] +[default2]:[2022-10-07 12:25:40,457] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_00_optim_states.pt'] +[default2]:[2022-10-07 12:25:40,458] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt'] +[default1]:[2022-10-07 12:25:40,424] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_01-model_states.pt. +[default1]:[2022-10-07 12:25:40,493] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_01_optim_states.pt'] +[default1]:[2022-10-07 12:25:40,494] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt'] +[default3]:[2022-10-07 12:25:40,413] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_01-model_states.pt. +[default3]:[2022-10-07 12:25:40,456] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:40,488] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:40,488] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_01-model_states.pt... +[default1]:[2022-10-07 12:25:40,440] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_03_optim_states.pt'] +[default1]:[2022-10-07 12:25:40,441] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_03_optim_states.pt'] +[default0]:[2022-10-07 12:25:40,415] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:40,423] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:40,457] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:40,457] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:40,491] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:40,499] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:40,429] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:40,429] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_01-model_states.pt... +[default3]:[2022-10-07 12:25:40,474] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_01-model_states.pt. +[default3]:[2022-10-07 12:25:40,483] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:40,408] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:40,409] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_01-model_states.pt... +[default1]:[2022-10-07 12:25:40,464] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_01-model_states.pt. +[default1]:[2022-10-07 12:25:40,477] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:40,407] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_01-model_states.pt. +[default1]:[2022-10-07 12:25:40,420] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:40,456] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:40,456] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_01-model_states.pt... +[default1]:[2022-10-07 12:25:40,495] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_01-model_states.pt. +[default3]:[2022-10-07 12:25:40,405] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_01-model_states.pt. +[default3]:[2022-10-07 12:25:40,413] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:40,452] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:40,453] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_01-model_states.pt... +[default3]:[2022-10-07 12:25:40,492] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_01-model_states.pt. +[default3]:[2022-10-07 12:25:40,500] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:40,455] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:40,455] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:40,489] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:40,502] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:40,516] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_00_optim_states.pt'] +[default0]:[2022-10-07 12:25:40,516] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt'] +[default1]:[2022-10-07 12:25:40,503] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:40,503] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_01-model_states.pt... +[default1]:[2022-10-07 12:25:40,536] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_01-model_states.pt. +[default1]:[2022-10-07 12:25:40,548] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:40,578] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:40,578] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_01-model_states.pt... +[default0]:[2022-10-07 12:25:40,563] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:40,502] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:40,543] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:40,544] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_01-model_states.pt... +[default1]:[2022-10-07 12:25:40,578] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_01-model_states.pt. +[default1]:[2022-10-07 12:25:40,586] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:40,504] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:40,536] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:40,536] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:40,569] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:40,577] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:40,514] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:40,522] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:40,552] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:40,552] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:40,581] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:40,518] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_01-model_states.pt. +[default3]:[2022-10-07 12:25:40,530] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:40,562] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:40,563] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_01-model_states.pt... +[default3]:[2022-10-07 12:25:40,593] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_01-model_states.pt. +[default0]:[2022-10-07 12:25:40,536] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:40,537] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:40,571] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:40,579] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:40,522] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:40,523] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_01-model_states.pt... +[default3]:[2022-10-07 12:25:40,567] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_01-model_states.pt. +[default3]:[2022-10-07 12:25:40,576] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:40,529] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:40,530] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_01-model_states.pt... +[default1]:[2022-10-07 12:25:40,578] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_01-model_states.pt. +[default1]:[2022-10-07 12:25:40,591] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:40,508] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:40,547] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:40,547] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_01-model_states.pt... +[default1]:[2022-10-07 12:25:40,589] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_01-model_states.pt. +[default1]:[2022-10-07 12:25:40,602] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:40,530] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:40,531] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_01-model_states.pt... +[default3]:[2022-10-07 12:25:40,575] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_01-model_states.pt. +[default3]:[2022-10-07 12:25:40,583] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:40,547] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:40,547] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:40,582] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:40,595] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:40,517] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_02_optim_states.pt'] +[default0]:[2022-10-07 12:25:40,517] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_02_optim_states.pt'] +[default3]:[2022-10-07 12:25:40,649] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_03_optim_states.pt'] +[default3]:[2022-10-07 12:25:40,649] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_03_optim_states.pt'] +[default1]:[2022-10-07 12:25:40,616] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_01-model_states.pt. +[default1]:[2022-10-07 12:25:40,628] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:40,657] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:40,657] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_01-model_states.pt. +[default1]:[2022-10-07 12:25:40,657] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_01-model_states.pt... +[default1]:[2022-10-07 12:25:40,687] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_01-model_states.pt. +[default0]:[2022-10-07 12:25:40,613] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:40,613] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:40,643] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:40,688] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:40,632] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_02_optim_states.pt'] +[default2]:[2022-10-07 12:25:40,633] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_02_optim_states.pt'] +[default1]:[2022-10-07 12:25:40,621] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:40,621] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_01-model_states.pt... +[default1]:[2022-10-07 12:25:40,655] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_01-model_states.pt. +[default1]:[2022-10-07 12:25:40,663] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:40,698] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:40,699] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_01-model_states.pt... +[default0]:[2022-10-07 12:25:40,610] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:40,610] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:40,645] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:40,652] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:40,685] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:40,685] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:40,631] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_00_optim_states.pt'] +[default2]:[2022-10-07 12:25:40,631] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt'] +[default3]:[2022-10-07 12:25:40,648] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_01_optim_states.pt'] +[default3]:[2022-10-07 12:25:40,648] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt'] +[default0]:[2022-10-07 12:25:40,627] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:40,628] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:40,663] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:40,617] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:40,617] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_01-model_states.pt... +[default3]:[2022-10-07 12:25:40,661] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_01-model_states.pt. +[default3]:[2022-10-07 12:25:40,670] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:40,630] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:40,631] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_01-model_states.pt... +[default1]:[2022-10-07 12:25:40,680] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_01-model_states.pt. +[default1]:[2022-10-07 12:25:40,640] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:40,640] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_01-model_states.pt... +[default1]:[2022-10-07 12:25:40,676] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_01-model_states.pt. +[default1]:[2022-10-07 12:25:40,689] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:40,613] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:40,614] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_01-model_states.pt... +[default3]:[2022-10-07 12:25:40,658] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_01-model_states.pt. +[default3]:[2022-10-07 12:25:40,666] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:40,694] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:40,695] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_01-model_states.pt... +[default0]:[2022-10-07 12:25:40,629] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:40,629] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:40,663] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:40,676] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:40,751] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_02_optim_states.pt'] +[default0]:[2022-10-07 12:25:40,751] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_02_optim_states.pt'] +[default1]:[2022-10-07 12:25:40,764] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_03_optim_states.pt'] +[default1]:[2022-10-07 12:25:40,765] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_03_optim_states.pt'] +[default1]:[2022-10-07 12:25:40,700] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:40,729] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:40,729] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_01-model_states.pt... +[default3]:[2022-10-07 12:25:40,719] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:40,763] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:40,764] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_01-model_states.pt... +[default1]:[2022-10-07 12:25:40,759] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_01-model_states.pt. +[default1]:[2022-10-07 12:25:40,772] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:40,787] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_03_optim_states.pt'] +[default1]:[2022-10-07 12:25:40,788] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_03_optim_states.pt'] +[default0]:[2022-10-07 12:25:40,741] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:40,741] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:40,775] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:40,788] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:40,732] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_01-model_states.pt. +[default1]:[2022-10-07 12:25:40,786] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_01_optim_states.pt'] +[default1]:[2022-10-07 12:25:40,787] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt'] +[default2]:[2022-10-07 12:25:40,721] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:40,752] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_00_optim_states.pt'] +[default0]:[2022-10-07 12:25:40,753] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt'] +[default0]:could not find arguments in the checkpoint ... +[default0]: checkpoint version 3.0 +[default3]:[2022-10-07 12:25:40,711] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:40,711] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_01-model_states.pt... +[default3]:[2022-10-07 12:25:40,754] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_01-model_states.pt. +[default1]:[2022-10-07 12:25:40,763] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoint[default1]:[2022-10-07 12:25:40,725] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:40,725] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_01-model_states.pt... +[default1]:[2022-10-07 12:25:40,761] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_01-model_states.pt. +s/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_01_optim_states.pt'] +[default1]:[2022-10-07 12:25:40,765] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt'] +[default1]:[2022-10-07 12:25:40,774] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:40,740] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_01-model_states.pt. +[default3]:[2022-10-07 12:25:40,748] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:40,778] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:40,779] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_01-model_states.pt... +[default0]:[2022-10-07 12:25:40,710] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:40,710] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:40,746] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:40,758] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:40,790] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:40,791] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:40,719] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:40,727] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:40,759] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:40,760] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:40,792] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:40,800] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:40,810] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_03_optim_states.pt'] +[default3]:[2022-10-07 12:25:40,810] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_03_optim_states.pt'] +[default1]:[2022-10-07 12:25:40,801] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:40,802] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_01-model_states.pt... +[default1]:[2022-10-07 12:25:40,833] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_01-model_states.pt. +[default3]:[2022-10-07 12:25:40,804] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_01-model_states.pt. +[default3]:[2022-10-07 12:25:40,846] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:40,898] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:40,845] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:40,876] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:40,876] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_01-model_states.pt... +[default3]:[2022-10-07 12:25:40,898] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_01-model_states.pt... +[default0]:[2022-10-07 12:25:40,837] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:40,838] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:40,873] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:40,886] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:40,831] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:40,831] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:40,861] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:40,831] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:40,881] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:40,881] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:40,809] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_01_optim_states.pt'] +[default3]:[2022-10-07 12:25:40,809] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt'] +[default1]:[2022-10-07 12:25:40,811] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:40,812] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_01-model_states.pt... +[default1]:[2022-10-07 12:25:40,847] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_01-model_states.pt. +[default1]:[2022-10-07 12:25:40,860] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:40,897] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:40,897] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_01-model_states.pt... +[default3]:[2022-10-07 12:25:40,824] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_01-model_states.pt. +[default3]:[2022-10-07 12:25:40,832] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:40,861] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:40,862] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_01-model_states.pt... +[default3]:[2022-10-07 12:25:40,903] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_01-model_states.pt. +[default0]:[2022-10-07 12:25:40,824] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:40,837] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:40,871] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:40,871] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:40,907] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_01-model_states.pt. +[default1]:[2022-10-07 12:25:40,919] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:40,956] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:40,938] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_01-model_states.pt. +[default3]:[2022-10-07 12:25:40,947] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:40,984] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:40,956] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_01-model_states.pt... +[default1]:[2022-10-07 12:25:40,987] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_01-model_states.pt. +[default3]:[2022-10-07 12:25:40,985] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_01-model_states.pt... +[default0]:[2022-10-07 12:25:40,927] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:40,927] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:40,961] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:40,974] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:40,913] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_02_optim_states.pt'] +[default0]:[2022-10-07 12:25:40,913] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_02_optim_states.pt'] +[default0]:[2022-10-07 12:25:40,912] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_00_optim_states.pt'] +[default0]:[2022-10-07 12:25:40,912] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt'] +[default2]:[2022-10-07 12:25:40,910] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:40,953] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:41,001] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:41,001] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:40,932] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_01-model_states.pt. +[default1]:[2022-10-07 12:25:40,945] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:40,982] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:40,982] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_01-model_states.pt... +[default3]:[2022-10-07 12:25:40,911] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:40,942] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:40,943] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_01-model_states.pt... +[default3]:[2022-10-07 12:25:40,988] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_01-model_states.pt. +[default3]:[2022-10-07 12:25:40,996] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:40,906] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:40,918] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:40,952] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:40,953] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:40,987] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:41,000] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:40,999] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:41,028] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:41,028] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_01-model_states.pt... +[default3]:[2022-10-07 12:25:41,028] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_01-model_states.pt. +[default3]:[2022-10-07 12:25:41,036] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:41,072] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:41,059] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_01-model_states.pt. +[default3]:[2022-10-07 12:25:41,072] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_01-model_states.pt... +[default0]:[2022-10-07 12:25:41,003] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:41,004] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:41,037] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:41,050] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:41,092] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:41,092] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:41,030] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:41,043] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:41,082] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:41,082] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:41,016] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_01-model_states.pt. +[default1]:[2022-10-07 12:25:41,028] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:41,065] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:41,065] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_01-model_states.pt... +[default1]:[2022-10-07 12:25:41,100] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_01-model_states.pt. +[default3]:[2022-10-07 12:25:41,026] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:41,027] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_01-model_states.pt... +[default3]:[2022-10-07 12:25:41,070] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_01-model_states.pt. +[default3]:[2022-10-07 12:25:41,078] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:41,034] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:41,034] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:41,069] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:41,082] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:41,114] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/globa[default3]:[2022-10-07 12:25:41,108] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_01-model_states.pt. +[default3]:[2022-10-07 12:25:41,117] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:41,158] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +l_step0/zero_pp_rank_5_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_01_optim_states.pt'] +[default1]:[2022-10-07 12:25:41,115] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt'] +[default3]:[2022-10-07 12:25:41,158] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_01-model_states.pt... +[default3]:[2022-10-07 12:25:41,195] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_01-model_states.pt. +[default0]:[2022-10-07 12:25:41,127] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:41,140] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:41,181] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:41,181] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:41,111] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:41,124] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:41,161] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:41,161] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:41,190] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:41,203] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:41,115] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_03_optim_states.pt'] +[default1]:[2022-10-07 12:25:41,116] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_03_optim_states.pt'] +[default1]:[2022-10-07 12:25:41,113] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:41,149] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default1]:[2022-10-07 12:25:41,149] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_01-model_states.pt... +[default1]:[2022-10-07 12:25:41,183] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_01-model_states.pt. +[default3]:[2022-10-07 12:25:41,108] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:41,109] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_01-model_states.pt... +[default3]:[2022-10-07 12:25:41,153] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_01-model_states.pt. +[default3]:[2022-10-07 12:25:41,203] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_01_optim_states.pt'] +[default3]:[2022-10-07 12:25:41,204] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt'] +[default0]:[2022-10-07 12:25:41,115] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:41,115] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:41,149] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:41,162] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:41,196] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:41,196] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:41,203] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:41,238] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:41,238] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_01-model_states.pt... +[default3]:[2022-10-07 12:25:41,274] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_01-model_states.pt. +[default3]:[2022-10-07 12:25:41,283] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:41,215] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:41,228] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:41,262] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:41,262] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:41,296] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:41,204] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_03_optim_states.pt'] +[default3]:[2022-10-07 12:25:41,205] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_03_optim_states.pt'] +[default1]:[2022-10-07 12:25:41,242] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_03_optim_states.pt'] +[default1]:[2022-10-07 12:25:41,242] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_03_optim_states.pt'] +[default2]:[2022-10-07 12:25:41,240] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:41,241] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:41,269] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:41,281] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default1]:[2022-10-07 12:25:41,241] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_01_optim_states.pt'] +[default1]:[2022-10-07 12:25:41,241] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt'] +[default0]:[2022-10-07 12:25:41,228] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:41,240] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:41,270] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:41,270] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:41,299] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:41,314] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:41,314] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_01-model_states.pt... +[default3]:[2022-10-07 12:25:41,347] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_01-model_states.pt. +[default3]:[2022-10-07 12:25:41,356] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:41,390] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:41,390] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_01-model_states.pt... +[default0]:[2022-10-07 12:25:41,309] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:41,342] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:41,342] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:41,374] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:41,387] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:41,355] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_02_optim_states.pt'] +[default0]:[2022-10-07 12:25:41,355] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_02_optim_states.pt'] +[default2]:[2022-10-07 12:25:41,319] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:41,319] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:41,348] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:41,360] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:41,395] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:41,396] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:41,353] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_00_optim_states.pt'] +[default0]:[2022-10-07 12:25:41,354] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt'] +[default3]:[2022-10-07 12:25:41,424] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_01-model_states.pt. +[default3]:[2022-10-07 12:25:41,433] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:41,468] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:41,468] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_01-model_states.pt... +[default3]:[2022-10-07 12:25:41,500] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_01-model_states.pt. +[default0]:[2022-10-07 12:25:41,418] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:41,418] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:41,452] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:41,464] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:41,497] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:41,497] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:41,425] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:41,437] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:41,474] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:41,474] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:41,503] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:41,509] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:41,539] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:41,540] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_01-model_states.pt... +[default3]:[2022-10-07 12:25:41,575] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_01-model_states.pt. +[default3]:[2022-10-07 12:25:41,583] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:41,529] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:41,542] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:41,573] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:41,574] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:41,515] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:41,552] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:41,552] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:41,581] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:41,593] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:41,620] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:41,621] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_01-model_states.pt... +[default3]:[2022-10-07 12:25:41,663] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_01-model_states.pt. +[default3]:[2022-10-07 12:25:41,671] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default0]:[2022-10-07 12:25:41,607] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default0]:[2022-10-07 12:25:41,663] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_00_optim_states.pt'] +[default0]:[2022-10-07 12:25:41,663] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt'] +[default0]:[2022-10-07 12:25:41,664] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_02_optim_states.pt'] +[default0]:[2022-10-07 12:25:41,664] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_02_optim_states.pt'] +[default2]:[2022-10-07 12:25:41,631] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:41,632] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:41,661] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:41,673] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:41,703] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:41,703] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default3]:[2022-10-07 12:25:41,707] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default3]:[2022-10-07 12:25:41,708] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_01-model_states.pt... +[default3]:[2022-10-07 12:25:41,747] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_01-model_states.pt. +[default3]:[2022-10-07 12:25:41,798] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_01_optim_states.pt'] +[default3]:[2022-10-07 12:25:41,798] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt'] +[default3]:[2022-10-07 12:25:41,799] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_03_optim_states.pt'] +[default3]:[2022-10-07 12:25:41,799] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_03_optim_states.pt'] +[default2]:[2022-10-07 12:25:41,732] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:41,744] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:41,776] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:41,776] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default2]:[2022-10-07 12:25:41,805] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default2]:[2022-10-07 12:25:41,861] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_02_optim_states.pt'] +[default2]:[2022-10-07 12:25:41,861] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_02_optim_states.pt'] +[default2]:[2022-10-07 12:25:41,859] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_00_optim_states.pt'] +[default2]:[2022-10-07 12:25:41,860] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt'] +[default0]: successfully loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq at iteration 0 +[default3]:time (ms) | load-checkpoint: 9865.40 +[default0]:estimated model parameters: 2.236514304 +[default0]:estimated model parameters without embeddings: 1.208909824 +[default0]:[after model, optimizer, and learning rate scheduler are built] datetime: 2022-10-07 12:25:42 +[default0]:> building train, validation, and test datasets ... +[default0]: > datasets target sizes (minimum size): +[default0]: train: 6348800 +[default0]: validation: 512000 +[default0]: test: 20480 +[default0]:> building train, validation, and test datasets for T0 ... +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.053807 seconds +[default0]: number of documents: 31495184 +[default0]: > dataset split: +[default0]:/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/utils.py:365: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings +[default0]: warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") +[default0]: train: +[default0]: document indices in [0, 29920425) total of 29920425 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.012814 seconds +[default0]: number of documents: 31495184 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.004322 seconds +[default0]: number of documents: 31495184 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_en_train_indexmap_2470317ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_en_train_indexmap_2470317ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.067 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.019167 seconds +[default0]: number of documents: 5151349 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 4893782) total of 4893782 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.014617 seconds +[default0]: number of documents: 5151349 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003835 seconds +[default0]: number of documents: 5151349 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_es_train_indexmap_501992ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_es_train_indexmap_501992ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.069 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.012014 seconds +[default0]: number of documents: 3562772 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 3384633) total of 3384633 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.013652 seconds +[default0]: number of documents: 3562772 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.004133 seconds +[default0]: number of documents: 3562772 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_pt_train_indexmap_406376ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_pt_train_indexmap_406376ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.036 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.066131 seconds +[default0]: number of documents: 2707724 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 2572338) total of 2572338 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.004359 seconds +[default0]: number of documents: 2707724 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.004138 seconds +[default0]: number of documents: 2707724 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_code_train_indexmap_373254ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_code_train_indexmap_373254ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.041 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.012726 seconds +[default0]: number of documents: 5055942 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 4803145) total of 4803145 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.017470 seconds +[default0]: number of documents: 5055942 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003801 seconds +[default0]: number of documents: 5055942 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_fr_train_indexmap_367732ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_fr_train_indexmap_367732ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.067 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.013743 seconds +[default0]: number of documents: 2148955 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 2041507) total of 2041507 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.012626 seconds +[default0]: number of documents: 2148955 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003647 seconds +[default0]: number of documents: 2148955 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ar_train_indexmap_310091ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ar_train_indexmap_310091ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.027 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.017313 seconds +[default0]: number of documents: 2627392 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 2496022) total of 2496022 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.011785 seconds +[default0]: number of documents: 2627392 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003237 seconds +[default0]: number of documents: 2627392 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_id_train_indexmap_305386ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_id_train_indexmap_305386ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.027 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.013191 seconds +[default0]: number of documents: 3560556 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 3382528) total of 3382528 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.021879 seconds +[default0]: number of documents: 3560556 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.004498 seconds +[default0]: number of documents: 3560556 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_zh_train_indexmap_304299ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_zh_train_indexmap_304299ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.038 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.016063 seconds +[default0]: number of documents: 1543441 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 1466269) total of 1466269 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.013636 seconds +[default0]: number of documents: 1543441 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003742 seconds +[default0]: number of documents: 1543441 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_hi_train_indexmap_291291ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_hi_train_indexmap_291291ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.071 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.013205 seconds +[default0]: number of documents: 1667306 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 1583941) total of 1583941 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.012560 seconds +[default0]: number of documents: 1667306 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003803 seconds +[default0]: number of documents: 1667306 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_vi_train_indexmap_205616ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_vi_train_indexmap_205616ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.029 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.013633 seconds +[default0]: number of documents: 855756 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 812968) total of 812968 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.017093 seconds +[default0]: number of documents: 855756 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003637 seconds +[default0]: number of documents: 855756 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ur_train_indexmap_127177ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ur_train_indexmap_127177ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.072 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.011663 seconds +[default0]: number of documents: 573364 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 544696) total of 544696 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.010167 seconds +[default0]: number of documents: 573364 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.002705 seconds +[default0]: number of documents: 573364 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_te_train_indexmap_88369ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_te_train_indexmap_88369ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.015 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.007856 seconds +[default0]: number of documents: 410633 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 390101) total of 390101 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.006729 seconds +[default0]: number of documents: 410633 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001759 seconds +[default0]: number of documents: 410633 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ta_train_indexmap_61292ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ta_train_indexmap_61292ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.028 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.008526 seconds +[default0]: number of documents: 428843 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 407401) total of 407401 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.007771 seconds +[default0]: number of documents: 428843 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001871 seconds +[default0]: number of documents: 428843 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_bn_train_indexmap_55208ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_bn_train_indexmap_55208ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.014 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.007255 seconds +[default0]: number of documents: 417269 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 396406) total of 396406 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.007456 seconds +[default0]: number of documents: 417269 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.002318 seconds +[default0]: number of documents: 417269 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_mr_train_indexmap_44171ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_mr_train_indexmap_44171ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.041 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.014368 seconds +[default0]: number of documents: 1114455 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 1058732) total of 1058732 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.044144 seconds +[default0]: number of documents: 1114455 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.004273 seconds +[default0]: number of documents: 1114455 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_sw_train_indexmap_37186ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_sw_train_indexmap_37186ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.014 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.008440 seconds +[default0]: number of documents: 347499 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 330124) total of 330124 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.008784 seconds +[default0]: number of documents: 347499 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001765 seconds +[default0]: number of documents: 347499 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_gu_train_indexmap_37173ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_gu_train_indexmap_37173ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.010 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.008307 seconds +[default0]: number of documents: 339210 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 322250) total of 322250 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.005841 seconds +[default0]: number of documents: 339210 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001816 seconds +[default0]: number of documents: 339210 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_pa_train_indexmap_34691ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_pa_train_indexmap_34691ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.012 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.007159 seconds +[default0]: number of documents: 315754 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 299966) total of 299966 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.006006 seconds +[default0]: number of documents: 315754 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001729 seconds +[default0]: number of documents: 315754 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ne_train_indexmap_26100ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ne_train_indexmap_26100ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.010 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.024440 seconds +[default0]: number of documents: 918416 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 872495) total of 872495 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.019413 seconds +[default0]: number of documents: 918416 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003670 seconds +[default0]: number of documents: 918416 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_yo_train_indexmap_23389ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_yo_train_indexmap_23389ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.030 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.029322 seconds +[default0]: number of documents: 950097 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 902592) total of 902592 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.004026 seconds +[default0]: number of documents: 950097 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003507 seconds +[default0]: number of documents: 950097 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ig_train_indexmap_21563ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ig_train_indexmap_21563ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.012 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.032180 seconds +[default0]: number of documents: 915063 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 869310) total of 869310 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.012668 seconds +[default0]: number of documents: 915063 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003469 seconds +[default0]: number of documents: 915063 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ny_train_indexmap_18042ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ny_train_indexmap_18042ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.029 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.025338 seconds +[default0]: number of documents: 915061 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 869308) total of 869308 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.012805 seconds +[default0]: number of documents: 915061 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.004220 seconds +[default0]: number of documents: 915061 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_zu_train_indexmap_17484ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_zu_train_indexmap_17484ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.011 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.044237 seconds +[default0]: number of documents: 915058 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 869305) total of 869305 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.012694 seconds +[default0]: number of documents: 915058 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003627 seconds +[default0]: number of documents: 915058 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_xh_train_indexmap_16885ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_xh_train_indexmap_16885ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.011 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.017153 seconds +[default0]: number of documents: 865056 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 821803) total of 821803 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.012570 seconds +[default0]: number of documents: 865056 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003373 seconds +[default0]: number of documents: 865056 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_sn_train_indexmap_16740ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_sn_train_indexmap_16740ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.010 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.032449 seconds +[default0]: number of documents: 915044 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 869292) total of 869292 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.021937 seconds +[default0]: number of documents: 915044 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003738 seconds +[default0]: number of documents: 915044 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ts_train_indexmap_16592ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ts_train_indexmap_16592ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.012 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.027825 seconds +[default0]: number of documents: 915043 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 869291) total of 869291 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.006104 seconds +[default0]: number of documents: 915043 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003756 seconds +[default0]: number of documents: 915043 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_rw_train_indexmap_16532ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_rw_train_indexmap_16532ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.040 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.015216 seconds +[default0]: number of documents: 915021 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 869270) total of 869270 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.026726 seconds +[default0]: number of documents: 915021 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003619 seconds +[default0]: number of documents: 915021 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_lg_train_indexmap_15642ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_lg_train_indexmap_15642ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.011 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.026097 seconds +[default0]: number of documents: 915054 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 869301) total of 869301 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.011752 seconds +[default0]: number of documents: 915054 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003905 seconds +[default0]: number of documents: 915054 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_tn_train_indexmap_15616ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_tn_train_indexmap_15616ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.010 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.022641 seconds +[default0]: number of documents: 915051 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 869298) total of 869298 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.019110 seconds +[default0]: number of documents: 915051 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003824 seconds +[default0]: number of documents: 915051 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_nso_train_indexmap_15230ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_nso_train_indexmap_15230ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.011 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.006559 seconds +[default0]: number of documents: 318189 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 302280) total of 302280 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.006569 seconds +[default0]: number of documents: 318189 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001887 seconds +[default0]: number of documents: 318189 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_rn_train_indexmap_12795ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_rn_train_indexmap_12795ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.004 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.008180 seconds +[default0]: number of documents: 265864 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 252571) total of 252571 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.005173 seconds +[default0]: number of documents: 265864 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001410 seconds +[default0]: number of documents: 265864 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ml_train_indexmap_11605ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ml_train_indexmap_11605ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.007 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.005540 seconds +[default0]: number of documents: 265063 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 251810) total of 251810 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.004670 seconds +[default0]: number of documents: 265063 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001218 seconds +[default0]: number of documents: 265063 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_kn_train_indexmap_10970ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_kn_train_indexmap_10970ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.004 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.007969 seconds +[default0]: number of documents: 265063 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 251810) total of 251810 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.005063 seconds +[default0]: number of documents: 265063 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001160 seconds +[default0]: number of documents: 265063 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_or_train_indexmap_10706ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_or_train_indexmap_10706ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.006 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.006101 seconds +[default0]: number of documents: 265063 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 251810) total of 251810 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.005147 seconds +[default0]: number of documents: 265063 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001881 seconds +[default0]: number of documents: 265063 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_as_train_indexmap_10360ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_as_train_indexmap_10360ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.005 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.006485 seconds +[default0]: number of documents: 365060 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 346807) total of 346807 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.005972 seconds +[default0]: number of documents: 365060 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001877 seconds +[default0]: number of documents: 365060 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ln_train_indexmap_8374ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ln_train_indexmap_8374ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.005 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.006017 seconds +[default0]: number of documents: 365063 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 346810) total of 346810 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.006544 seconds +[default0]: number of documents: 365063 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001674 seconds +[default0]: number of documents: 365063 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_wo_train_indexmap_8126ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_wo_train_indexmap_8126ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.006 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.005839 seconds +[default0]: number of documents: 265063 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 251810) total of 251810 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.005586 seconds +[default0]: number of documents: 265063 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001767 seconds +[default0]: number of documents: 265063 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_tum_train_indexmap_7693ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_tum_train_indexmap_7693ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.005 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.007662 seconds +[default0]: number of documents: 265180 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 251921) total of 251921 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.006457 seconds +[default0]: number of documents: 265180 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001752 seconds +[default0]: number of documents: 265180 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ki_train_indexmap_7627ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ki_train_indexmap_7627ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.006 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.005666 seconds +[default0]: number of documents: 265063 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 251810) total of 251810 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.006371 seconds +[default0]: number of documents: 265063 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001447 seconds +[default0]: number of documents: 265063 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_st_train_indexmap_7564ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_st_train_indexmap_7564ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.005 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.005915 seconds +[default0]: number of documents: 265063 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 251810) total of 251810 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.014987 seconds +[default0]: number of documents: 265063 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001231 seconds +[default0]: number of documents: 265063 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_fon_train_indexmap_7497ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_fon_train_indexmap_7497ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.005 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.004858 seconds +[default0]: number of documents: 271191 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 257631) total of 257631 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.005929 seconds +[default0]: number of documents: 271191 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001395 seconds +[default0]: number of documents: 271191 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ca_train_indexmap_7334ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ca_train_indexmap_7334ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.004 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.006495 seconds +[default0]: number of documents: 269973 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 256474) total of 256474 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.006160 seconds +[default0]: number of documents: 269973 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001020 seconds +[default0]: number of documents: 269973 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_eu_train_indexmap_7168ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_eu_train_indexmap_7168ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.005 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.005116 seconds +[default0]: number of documents: 265071 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 251817) total of 251817 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.005774 seconds +[default0]: number of documents: 265071 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001285 seconds +[default0]: number of documents: 265071 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ak_train_indexmap_7167ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ak_train_indexmap_7167ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.025 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.006567 seconds +[default0]: number of documents: 265180 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 251921) total of 251921 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.005451 seconds +[default0]: number of documents: 265180 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001592 seconds +[default0]: number of documents: 265180 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_bm_train_indexmap_7098ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_bm_train_indexmap_7098ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.004 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.006409 seconds +[default0]: number of documents: 265071 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 251817) total of 251817 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.005809 seconds +[default0]: number of documents: 265071 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001631 seconds +[default0]: number of documents: 265071 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_tw_train_indexmap_7047ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_tw_train_indexmap_7047ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.005 seconds +[default0]:> building indices for blendable datasets ... +[default0]: > sample ratios: +[default0]: dataset 0, input: 0.387164, achieved: 0.387164 +[default0]: dataset 1, input: 0.0786754, achieved: 0.0786753 +[default0]: dataset 2, input: 0.0636898, achieved: 0.0636898 +[default0]: dataset 3, input: 0.0584986, achieved: 0.0584986 +[default0]: dataset 4, input: 0.0576332, achieved: 0.0576332 +[default0]: dataset 5, input: 0.0485994, achieved: 0.0485994 +[default0]: dataset 6, input: 0.0478619, achieved: 0.0478619 +[default0]: dataset 7, input: 0.0476917, achieved: 0.0476917 +[default0]: dataset 8, input: 0.045653, achieved: 0.045653 +[default0]: dataset 9, input: 0.0322254, achieved: 0.0322254 +[default0]: dataset 10, input: 0.0199319, achieved: 0.0199319 +[default0]: dataset 11, input: 0.0138497, achieved: 0.0138497 +[default0]: dataset 12, input: 0.00960604, achieved: 0.00960602 +[default0]: dataset 13, input: 0.00865243, achieved: 0.00865244 +[default0]: dataset 14, input: 0.00692261, achieved: 0.00692258 +[default0]: dataset 15, input: 0.00582803, achieved: 0.00582806 +[default0]: dataset 16, input: 0.00582586, achieved: 0.00582586 +[default0]: dataset 17, input: 0.00543684, achieved: 0.00543682 +[default0]: dataset 18, input: 0.00409056, achieved: 0.00409057 +[default0]: dataset 19, input: 0.00366562, achieved: 0.00366564 +[default0]: dataset 20, input: 0.00337934, achieved: 0.00337937 +[default0]: dataset 21, input: 0.00282756, achieved: 0.00282753 +[default0]: dataset 22, input: 0.00274012, achieved: 0.00274012 +[default0]: dataset 23, input: 0.00264619, achieved: 0.00264622 +[default0]: dataset 24, input: 0.00262358, achieved: 0.0026236 +[default0]: dataset 25, input: 0.0026003, achieved: 0.00260032 +[default0]: dataset 26, input: 0.00259099, achieved: 0.00259097 +[default0]: dataset 27, input: 0.00245151, achieved: 0.00245155 +[default0]: dataset 28, input: 0.00244735, achieved: 0.00244736 +[default0]: dataset 29, input: 0.00238684, achieved: 0.00238686 +[default0]: dataset 30, input: 0.0020053, achieved: 0.00200525 +[default0]: dataset 31, input: 0.00181876, achieved: 0.00181879 +[default0]: dataset 32, input: 0.00171918, achieved: 0.00171917 +[default0]: dataset 33, input: 0.00167779, achieved: 0.00167776 +[default0]: dataset 34, input: 0.00162359, achieved: 0.00162355 +[default0]: dataset 35, input: 0.00131237, achieved: 0.00131238 +[default0]: dataset 36, input: 0.00127347, achieved: 0.00127344 +[default0]: dataset 37, input: 0.00120564, achieved: 0.00120569 +[default0]: dataset 38, input: 0.00119533, achieved: 0.00119529 +[default0]: dataset 39, input: 0.00118536, achieved: 0.00118536 +[default0]: dataset 40, input: 0.00117488, achieved: 0.00117487 +[default0]: dataset 41, input: 0.00114928, achieved: 0.00114929 +[default0]: dataset 42, input: 0.00112334, achieved: 0.00112334 +[default0]: dataset 43, input: 0.00112318, achieved: 0.00112315 +[default0]: dataset 44, input: 0.00111237, achieved: 0.00111236 +[default0]: dataset 45, input: 0.00110439, achieved: 0.00110444 +[default0]:> elapsed time for building blendable dataset indices: 0.93 (sec) +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.056978 seconds +[default0]: number of documents: 15234080 +[default0]: > dataset split: +[default0]: validation_pretraining: +[default0]: document indices in [14472376, 15234080) total of 761704 documents +[default0]: > loading doc-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/ar/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_ar_text_document_validation_pretraining_indexmap_17016ns_2048sl_42s_doc_idx.npy +[default0]: > loading sample-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/ar/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_ar_text_document_validation_pretraining_indexmap_17016ns_2048sl_42s_sample_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/ar/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_ar_text_document_validation_pretraining_indexmap_17016ns_2048sl_42s_shuffle_idx.npy +[default0]: loaded indexed file in 0.075 seconds +[default0]: total number of samples: 221750 +[default0]: total number of epochs: 1 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.053933 seconds +[default0]: number of documents: 6142390 +[default0]: > dataset split: +[default0]: validation_pretraining: +[default0]: document indices in [5835270, 6142390) total of 307120 documents +[default0]: > loading doc-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/ca/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_ca_text_document_validation_pretraining_indexmap_5785ns_2048sl_42s_doc_idx.npy +[default0]: > loading sample-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/ca/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_ca_text_document_validation_pretraining_indexmap_5785ns_2048sl_42s_sample_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/ca/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_ca_text_document_validation_pretraining_indexmap_5785ns_2048sl_42s_shuffle_idx.npy +[default0]: loaded indexed file in 0.085 seconds +[default0]: total number of samples: 136143 +[default0]: total number of epochs: 1 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.056630 seconds +[default0]: number of documents: 26176998 +[default0]: > dataset split: +[default0]: validation_pretraining: +[default0]: document indices in [24868148, 26176998) total of 1308850 documents +[default0]: > loading doc-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/code/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_code_text_document_validation_pretraining_indexmap_67033ns_2048sl_42s_doc_idx.npy +[default0]: > loading sample-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/code/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_code_text_document_validation_pretraining_indexmap_67033ns_2048sl_42s_sample_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/code/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_code_text_document_validation_pretraining_indexmap_67033ns_2048sl_42s_shuffle_idx.npy +[default0]: loaded indexed file in 0.115 seconds +[default0]: total number of samples: 432311 +[default0]: total number of epochs: 1 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.069141 seconds +[default0]: number of documents: 20844665 +[default0]: > dataset split: +[default0]: validation_pretraining: +[default0]: document indices in [19802432, 20844665) total of 1042233 documents +[default0]: > loading doc-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/en/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_en_text_document_validation_pretraining_indexmap_114084ns_2048sl_42s_doc_idx.npy +[default0]: > loading sample-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/en/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_en_text_document_validation_pretraining_indexmap_114084ns_2048sl_42s_sample_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/en/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_en_text_document_validation_pretraining_indexmap_114084ns_2048sl_42s_shuffle_idx.npy +[default0]: loaded indexed file in 0.092 seconds +[default0]: total number of samples: 521545 +[default0]: total number of epochs: 1 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.056562 seconds +[default0]: number of documents: 67005817 +[default0]: > dataset split: +[default0]: validation_pretraining: +[default0]: document indices in [63655526, 67005817) total of 3350291 documents +[default0]: > loading doc-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/es/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_es_text_document_validation_pretraining_indexmap_54893ns_2048sl_42s_doc_idx.npy +[default0]: > loading sample-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/es/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_es_text_document_validation_pretraining_indexmap_54893ns_2048sl_42s_sample_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/es/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_es_text_document_validation_pretraining_indexmap_54893ns_2048sl_42s_shuffle_idx.npy +[default0]: loaded indexed file in 0.136 seconds +[default0]: total number of samples: 1740321 +[default0]: total number of epochs: 1 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.057870 seconds +[default0]: number of documents: 5149795 +[default0]: > dataset split: +[default0]: validation_pretraining: +[default0]: document indices in [4892305, 5149795) total of 257490 documents +[default0]: > loading doc-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/eu/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_eu_text_document_validation_pretraining_indexmap_803ns_2048sl_42s_doc_idx.npy +[default0]: > loading sample-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/eu/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_eu_text_document_validation_pretraining_indexmap_803ns_2048sl_42s_sample_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/eu/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_eu_text_document_validation_pretraining_indexmap_803ns_2048sl_42s_shuffle_idx.npy +[default0]: loaded indexed file in 0.046 seconds +[default0]: total number of samples: 26370 +[default0]: total number of epochs: 1 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.058233 seconds +[default0]: number of documents: 58847091 +[default0]: > dataset split: +[default0]: validation_pretraining: +[default0]: document indices in [55904736, 58847091) total of 2942355 documents +[default0]: > loading doc-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/fr/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_fr_text_document_validation_pretraining_indexmap_67171ns_2048sl_42s_doc_idx.npy +[default0]: > loading sample-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/fr/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_fr_text_document_validation_pretraining_indexmap_67171ns_2048sl_42s_sample_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/fr/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_fr_text_document_validation_pretraining_indexmap_67171ns_2048sl_42s_shuffle_idx.npy +[default0]: loaded indexed file in 0.134 seconds +[default0]: total number of samples: 1458654 +[default0]: total number of epochs: 1 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.059632 seconds +[default0]: number of documents: 12514253 +[default0]: > dataset split: +[default0]: validation_pretraining: +[default0]: document indices in [11888540, 12514253) total of 625713 documents +[default0]: > loading doc-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/id/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_id_text_document_validation_pretraining_indexmap_5618ns_2048sl_42s_doc_idx.npy +[default0]: > loading sample-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/id/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_id_text_document_validation_pretraining_indexmap_5618ns_2048sl_42s_sample_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/id/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_id_text_document_validation_pretraining_indexmap_5618ns_2048sl_42s_shuffle_idx.npy +[default0]: loaded indexed file in 0.086 seconds +[default0]: total number of samples: 134071 +[default0]: total number of epochs: 1 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.031870 seconds +[default0]: number of documents: 180608 +[default0]: > dataset split: +[default0]: validation_pretraining: +[default0]: document indices in [171578, 180608) total of 9030 documents +[default0]: > loading doc-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-as/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-as_text_document_validation_pretraining_indexmap_57ns_2048sl_42s_doc_idx.npy +[default0]: > loading sample-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-as/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-as_text_document_validation_pretraining_indexmap_57ns_2048sl_42s_sample_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-as/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-as_text_document_validation_pretraining_indexmap_57ns_2048sl_42s_shuffle_idx.npy +[default0]: loaded indexed file in 0.005 seconds +[default0]: total number of samples: 2501 +[default0]: total number of epochs: 1 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.049287 seconds +[default0]: number of documents: 12303134 +[default0]: > dataset split: +[default0]: validation_pretraining: +[default0]: document indices in [11687977, 12303134) total of 615157 documents +[default0]: > loading doc-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-bn/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-bn_text_document_validation_pretraining_indexmap_2827ns_2048sl_42s_doc_idx.npy +[default0]: > loading sample-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-bn/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-bn_text_document_validation_pretraining_indexmap_2827ns_2048sl_42s_sample_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-bn/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-bn_text_document_validation_pretraining_indexmap_2827ns_2048sl_42s_shuffle_idx.npy +[default0]: loaded indexed file in 0.046 seconds +[default0]: total number of samples: 157244 +[default0]: total number of epochs: 1 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.054660 seconds +[default0]: number of documents: 2033057 +[default0]: > dataset split: +[default0]: validation_pretraining: +[default0]: document indices in [1931404, 2033057) total of 101653 documents +[default0]: > loading doc-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-gu/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-gu_text_document_validation_pretraining_indexmap_207ns_2048sl_42s_doc_idx.npy +[default0]: > loading sample-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-gu/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-gu_text_document_validation_pretraining_indexmap_207ns_2048sl_42s_sample_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-gu/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-gu_text_document_validation_pretraining_indexmap_207ns_2048sl_42s_shuffle_idx.npy +[default0]: loaded indexed file in 0.075 seconds +[default0]: total number of samples: 20517 +[default0]: total number of epochs: 1 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.045789 seconds +[default0]: number of documents: 26793553 +[default0]: > dataset split: +[default0]: validation_pretraining: +[default0]: document indices in [25453875, 26793553) total of 1339678 documents +[default0]: > loading doc-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-hi/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-hi_text_document_validation_pretraining_indexmap_3844ns_2048sl_42s_doc_idx.npy +[default0]: > loading sample-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-hi/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-hi_text_document_validation_pretraining_indexmap_3844ns_2048sl_42s_sample_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-hi/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-hi_text_document_validation_pretraining_indexmap_3844ns_2048sl_42s_shuffle_idx.npy +[default0]: loaded indexed file in 0.124 seconds +[default0]: total number of samples: 101502 +[default0]: total number of epochs: 1 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.061677 seconds +[default0]: number of documents: 3155990 +[default0]: > dataset split: +[default0]: validation_pretraining: +[default0]: document indices in [2998190, 3155990) total of 157800 documents +[default0]: > loading doc-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-kn/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-kn_text_document_validation_pretraining_indexmap_319ns_2048sl_42s_doc_idx.npy +[default0]: > loading sample-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-kn/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-kn_text_document_validation_pretraining_indexmap_319ns_2048sl_42s_sample_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-kn/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-kn_text_document_validation_pretraining_indexmap_319ns_2048sl_42s_shuffle_idx.npy +[default0]: loaded indexed file in 0.042 seconds +[default0]: total number of samples: 44182 +[default0]: total number of epochs: 1 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.054575 seconds +[default0]: number of documents: 6692522 +[default0]: > dataset split: +[default0]: validation_pretraining: +[default0]: document indices in [6357896, 6692522) total of 334626 documents +[default0]: > loading doc-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-ml/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-ml_text_document_validation_pretraining_indexmap_532ns_2048sl_42s_doc_idx.npy +[default0]: > loading sample-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-ml/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-ml_text_document_validation_pretraining_indexmap_532ns_2048sl_42s_sample_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-ml/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-ml_text_document_validation_pretraining_indexmap_532ns_2048sl_42s_shuffle_idx.npy +[default0]: loaded indexed file in 0.057 seconds +[default0]: total number of samples: 47613 +[default0]: total number of epochs: 1 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.061584 seconds +[default0]: number of documents: 3017261 +[default0]: > dataset split: +[default0]: validation_pretraining: +[default0]: document indices in [2866398, 3017261) total of 150863 documents +[default0]: > loading doc-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-mr/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-mr_text_document_validation_pretraining_indexmap_258ns_2048sl_42s_doc_idx.npy +[default0]: > loading sample-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-mr/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-mr_text_document_validation_pretraining_indexmap_258ns_2048sl_42s_sample_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-mr/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-mr_text_document_validation_pretraining_indexmap_258ns_2048sl_42s_shuffle_idx.npy +[default0]: loaded indexed file in 0.027 seconds +[default0]: total number of samples: 29298 +[default0]: total number of epochs: 1 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.060785 seconds +[default0]: number of documents: 3648041 +[default0]: > dataset split: +[default0]: validation_pretraining: +[default0]: document indices in [3465639, 3648041) total of 182402 documents +[default0]: > loading doc-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-ne/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-ne_text_document_validation_pretraining_indexmap_344ns_2048sl_42s_doc_idx.npy +[default0]: > loading sample-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-ne/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-ne_text_document_validation_pretraining_indexmap_344ns_2048sl_42s_sample_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-ne/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-ne_text_document_validation_pretraining_indexmap_344ns_2048sl_42s_shuffle_idx.npy +[default0]: loaded indexed file in 0.008 seconds +[default0]: total number of samples: 5659 +[default0]: total number of epochs: 1 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.054039 seconds +[default0]: number of documents: 4327282 +[default0]: > dataset split: +[default0]: validation_pretraining: +[default0]: document indices in [4110918, 4327282) total of 216364 documents +[default0]: > loading doc-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-or/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-or_text_document_validation_pretraining_indexmap_185ns_2048sl_42s_doc_idx.npy +[default0]: > loading sample-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-or/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-or_text_document_validation_pretraining_indexmap_185ns_2048sl_42s_sample_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-or/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-or_text_document_validation_pretraining_indexmap_185ns_2048sl_42s_shuffle_idx.npy +[default0]: loaded indexed file in 0.019 seconds +[default0]: total number of samples: 12423 +[default0]: total number of epochs: 1 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.065226 seconds +[default0]: number of documents: 2698896 +[default0]: > dataset split: +[default0]: validation_pretraining: +[default0]: document indices in [2563951, 2698896) total of 134945 documents +[default0]: > loading doc-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-pa/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-pa_text_document_validation_pretraining_indexmap_262ns_2048sl_42s_doc_idx.npy +[default0]: > loading sample-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-pa/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-pa_text_document_validation_pretraining_indexmap_262ns_2048sl_42s_sample_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-pa/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-pa_text_document_validation_pretraining_indexmap_262ns_2048sl_42s_shuffle_idx.npy +[default0]: loaded indexed file in 0.026 seconds +[default0]: total number of samples: 19133 +[default0]: total number of epochs: 1 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.060547 seconds +[default0]: number of documents: 12767593 +[default0]: > dataset split: +[default0]: validation_pretraining: +[default0]: document indices in [12129213, 12767593) total of 638380 documents +[default0]: > loading doc-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-ta/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-ta_text_document_validation_pretraining_indexmap_1088ns_2048sl_42s_doc_idx.npy +[default0]: > loading sample-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-ta/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-ta_text_document_validation_pretraining_indexmap_1088ns_2048sl_42s_sample_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-ta/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-ta_text_document_validation_pretraining_indexmap_1088ns_2048sl_42s_shuffle_idx.npy +[default0]: loaded indexed file in 0.050 seconds +[default0]: total number of samples: 87928 +[default0]: total number of epochs: 1 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.061740 seconds +[default0]: number of documents: 4342323 +[default0]: > dataset split: +[default0]: validation_pretraining: +[default0]: document indices in [4125207, 4342323) total of 217116 documents +[default0]: > loading doc-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-te/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-te_text_document_validation_pretraining_indexmap_470ns_2048sl_42s_doc_idx.npy +[default0]: > loading sample-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-te/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-te_text_document_validation_pretraining_indexmap_470ns_2048sl_42s_sample_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-te/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-te_text_document_validation_pretraining_indexmap_470ns_2048sl_42s_shuffle_idx.npy +[default0]: loaded indexed file in 0.045 seconds +[default0]: total number of samples: 69780 +[default0]: total number of epochs: 1 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.058736 seconds +[default0]: number of documents: 3022722 +[default0]: > dataset split: +[default0]: validation_pretraining: +[default0]: document indices in [2871586, 3022722) total of 151136 documents +[default0]: > loading doc-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-ur/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-ur_text_document_validation_pretraining_indexmap_641ns_2048sl_42s_doc_idx.npy +[default0]: > loading sample-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-ur/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-ur_text_document_validation_pretraining_indexmap_641ns_2048sl_42s_sample_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-ur/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-ur_text_document_validation_pretraining_indexmap_641ns_2048sl_42s_shuffle_idx.npy +[default0]: loaded indexed file in 0.008 seconds +[default0]: total number of samples: 22532 +[default0]: total number of epochs: 1 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.059977 seconds +[default0]: number of documents: 1162568 +[default0]: > dataset split: +[default0]: validation_pretraining: +[default0]: document indices in [1104440, 1162568) total of 58128 documents +[default0]: > loading doc-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/nigercongo-all/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_nigercongo-all_text_document_validation_pretraining_indexmap_163ns_2048sl_42s_doc_idx.npy +[default0]: > loading sample-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/nigercongo-all/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_nigercongo-all_text_document_validation_pretraining_indexmap_163ns_2048sl_42s_sample_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/nigercongo-all/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_nigercongo-all_text_document_validation_pretraining_indexmap_163ns_2048sl_42s_shuffle_idx.npy +[default0]: loaded indexed file in 0.006 seconds +[default0]: total number of samples: 1608 +[default0]: total number of epochs: 1 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.096544 seconds +[default0]: number of documents: 55294645 +[default0]: > dataset split: +[default0]: validation_pretraining: +[default0]: document indices in [52529913, 55294645) total of 2764732 documents +[default0]: > loading doc-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/oscar-en/meg_ds_bigscience_tokenizer_text_document_validation_pretraining_indexmap_41871ns_2048sl_42s_doc_idx.npy +[default0]: > loading sample-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/oscar-en/meg_ds_bigscience_tokenizer_text_document_validation_pretraining_indexmap_41871ns_2048sl_42s_sample_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/oscar-en/meg_ds_bigscience_tokenizer_text_document_validation_pretraining_indexmap_41871ns_2048sl_42s_shuffle_idx.npy +[default0]: loaded indexed file in 0.109 seconds +[default0]: total number of samples: 690621 +[default0]: total number of epochs: 1 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.050178 seconds +[default0]: number of documents: 44855616 +[default0]: > dataset split: +[default0]: validation_pretraining: +[default0]: document indices in [42612835, 44855616) total of 2242781 documents +[default0]: > loading doc-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/oscar-zh/meg_ds_bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_text_document_validation_pretraining_indexmap_28453ns_2048sl_42s_doc_idx.npy +[default0]: > loading sample-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/oscar-zh/meg_ds_bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_text_document_validation_pretraining_indexmap_28453ns_2048sl_42s_sample_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/oscar-zh/meg_ds_bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_text_document_validation_pretraining_indexmap_28453ns_2048sl_42s_shuffle_idx.npy +[default0]: loaded indexed file in 0.098 seconds +[default0]: total number of samples: 468689 +[default0]: total number of epochs: 1 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.055873 seconds +[default0]: number of documents: 31969891 +[default0]: > dataset split: +[default0]: validation_pretraining: +[default0]: document indices in [30371396, 31969891) total of 1598495 documents +[default0]: > loading doc-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/pt/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_pt_text_document_validation_pretraining_indexmap_25493ns_2048sl_42s_doc_idx.npy +[default0]: > loading sample-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/pt/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_pt_text_document_validation_pretraining_indexmap_25493ns_2048sl_42s_sample_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/pt/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_pt_text_document_validation_pretraining_indexmap_25493ns_2048sl_42s_shuffle_idx.npy +[default0]: loaded indexed file in 0.118 seconds +[default0]: total number of samples: 497625 +[default0]: total number of epochs: 1 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.059788 seconds +[default0]: number of documents: 34110375 +[default0]: > dataset split: +[default0]: validation_pretraining: +[default0]: document indices in [32404856, 34110375) total of 1705519 documents +[default0]: > loading doc-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/vi/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_vi_text_document_validation_pretraining_indexmap_12667ns_2048sl_42s_doc_idx.npy +[default0]: > loading sample-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/vi/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_vi_text_document_validation_pretraining_indexmap_12667ns_2048sl_42s_sample_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/vi/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_vi_text_document_validation_pretraining_indexmap_12667ns_2048sl_42s_shuffle_idx.npy +[default0]: loaded indexed file in 0.074 seconds +[default0]: total number of samples: 125120 +[default0]: total number of epochs: 1 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.079862 seconds +[default0]: number of documents: 43761623 +[default0]: > dataset split: +[default0]: validation_pretraining: +[default0]: document indices in [41573542, 43761623) total of 2188081 documents +[default0]: > loading doc-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/zhs/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_zhs_text_document_validation_pretraining_indexmap_62220ns_2048sl_42s_doc_idx.npy +[default0]: > loading sample-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/zhs/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_zhs_text_document_validation_pretraining_indexmap_62220ns_2048sl_42s_sample_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/zhs/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_zhs_text_document_validation_pretraining_indexmap_62220ns_2048sl_42s_shuffle_idx.npy +[default0]: loaded indexed file in 0.124 seconds +[default0]: total number of samples: 1010592 +[default0]: total number of epochs: 1 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.030556 seconds +[default0]: number of documents: 197602 +[default0]: > dataset split: +[default0]: validation_pretraining: +[default0]: document indices in [187722, 197602) total of 9880 documents +[default0]: > loading doc-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/zht/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_zht_text_document_validation_pretraining_indexmap_267ns_2048sl_42s_doc_idx.npy +[default0]: > loading sample-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/zht/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_zht_text_document_validation_pretraining_indexmap_267ns_2048sl_42s_sample_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/zht/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_zht_text_document_validation_pretraining_indexmap_267ns_2048sl_42s_shuffle_idx.npy +[default0]: loaded indexed file in 0.027 seconds +[default0]: total number of samples: 4451 +[default0]: total number of epochs: 1 +[default0]:> building indices for blendable datasets ... +[default0]: > sample ratios: +[default0]: dataset 0, input: 0.0330676, achieved: 0.0330676 +[default0]: dataset 1, input: 0.0112421, achieved: 0.0112421 +[default0]: dataset 2, input: 0.130272, achieved: 0.130272 +[default0]: dataset 3, input: 0.221712, achieved: 0.221712 +[default0]: dataset 4, input: 0.106678, achieved: 0.106678 +[default0]: dataset 5, input: 0.00155951, achieved: 0.00155955 +[default0]: dataset 6, input: 0.13054, achieved: 0.13054 +[default0]: dataset 7, input: 0.010918, achieved: 0.0109181 +[default0]: dataset 8, input: 0.000110214, achieved: 0.000110257 +[default0]: dataset 9, input: 0.00549238, achieved: 0.00549235 +[default0]: dataset 10, input: 0.000402122, achieved: 0.000402094 +[default0]: dataset 11, input: 0.00747007, achieved: 0.00747007 +[default0]: dataset 12, input: 0.000619047, achieved: 0.000619024 +[default0]: dataset 13, input: 0.00103353, achieved: 0.0010336 +[default0]: dataset 14, input: 0.000501201, achieved: 0.000501226 +[default0]: dataset 15, input: 0.000667277, achieved: 0.000667231 +[default0]: dataset 16, input: 0.000359281, achieved: 0.000359326 +[default0]: dataset 17, input: 0.000508443, achieved: 0.000508519 +[default0]: dataset 18, input: 0.00211373, achieved: 0.0021138 +[default0]: dataset 19, input: 0.000912995, achieved: 0.000912961 +[default0]: dataset 20, input: 0.00124543, achieved: 0.00124546 +[default0]: dataset 21, input: 0.000315887, achieved: 0.00031594 +[default0]: dataset 22, input: 0.0813721, achieved: 0.0813721 +[default0]: dataset 23, input: 0.0552939, achieved: 0.0552939 +[default0]: dataset 24, input: 0.0495415, achieved: 0.0495414 +[default0]: dataset 25, input: 0.0246164, achieved: 0.0246163 +[default0]: dataset 26, input: 0.120917, achieved: 0.120917 +[default0]: dataset 27, input: 0.000517703, achieved: 0.000517666 +[default0]:> elapsed time for building blendable dataset indices: 0.46 (sec) +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.004692 seconds +[default0]: number of documents: 31495184 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [29920425, 31495184) total of 1574759 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.004089 seconds +[default0]: number of documents: 31495184 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003383 seconds +[default0]: number of documents: 31495184 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_en_validation_indexmap_199220ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_en_validation_indexmap_199220ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.079 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003826 seconds +[default0]: number of documents: 5151349 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [4893782, 5151349) total of 257567 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003506 seconds +[default0]: number of documents: 5151349 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003306 seconds +[default0]: number of documents: 5151349 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_es_validation_indexmap_40484ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_es_validation_indexmap_40484ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.054 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003399 seconds +[default0]: number of documents: 3562772 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [3384633, 3562772) total of 178139 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.016040 seconds +[default0]: number of documents: 3562772 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003400 seconds +[default0]: number of documents: 3562772 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_pt_validation_indexmap_32773ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_pt_validation_indexmap_32773ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.064 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.004015 seconds +[default0]: number of documents: 2707724 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [2572338, 2707724) total of 135386 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003484 seconds +[default0]: number of documents: 2707724 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003715 seconds +[default0]: number of documents: 2707724 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_code_validation_indexmap_30102ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_code_validation_indexmap_30102ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.039 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.015790 seconds +[default0]: number of documents: 5055942 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [4803145, 5055942) total of 252797 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.016411 seconds +[default0]: number of documents: 5055942 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003730 seconds +[default0]: number of documents: 5055942 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_fr_validation_indexmap_29656ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_fr_validation_indexmap_29656ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.046 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.004120 seconds +[default0]: number of documents: 2148955 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [2041507, 2148955) total of 107448 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003349 seconds +[default0]: number of documents: 2148955 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003387 seconds +[default0]: number of documents: 2148955 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ar_validation_indexmap_25008ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ar_validation_indexmap_25008ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.067 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.004075 seconds +[default0]: number of documents: 2627392 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [2496022, 2627392) total of 131370 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.010676 seconds +[default0]: number of documents: 2627392 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003239 seconds +[default0]: number of documents: 2627392 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_id_validation_indexmap_24628ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_id_validation_indexmap_24628ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.034 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.013022 seconds +[default0]: number of documents: 3560556 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [3382528, 3560556) total of 178028 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.011269 seconds +[default0]: number of documents: 3560556 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003437 seconds +[default0]: number of documents: 3560556 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_zh_validation_indexmap_24541ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_zh_validation_indexmap_24541ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.046 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.004162 seconds +[default0]: number of documents: 1543441 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [1466269, 1543441) total of 77172 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003637 seconds +[default0]: number of documents: 1543441 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003289 seconds +[default0]: number of documents: 1543441 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_hi_validation_indexmap_23492ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_hi_validation_indexmap_23492ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.018 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003987 seconds +[default0]: number of documents: 1667306 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [1583941, 1667306) total of 83365 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.011900 seconds +[default0]: number of documents: 1667306 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003550 seconds +[default0]: number of documents: 1667306 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_vi_validation_indexmap_16582ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_vi_validation_indexmap_16582ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.053 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003930 seconds +[default0]: number of documents: 855756 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [812968, 855756) total of 42788 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.004069 seconds +[default0]: number of documents: 855756 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003541 seconds +[default0]: number of documents: 855756 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ur_validation_indexmap_10257ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ur_validation_indexmap_10257ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.038 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.007966 seconds +[default0]: number of documents: 573364 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [544696, 573364) total of 28668 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.008752 seconds +[default0]: number of documents: 573364 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.002506 seconds +[default0]: number of documents: 573364 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_te_validation_indexmap_7127ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_te_validation_indexmap_7127ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.020 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001974 seconds +[default0]: number of documents: 410633 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [390101, 410633) total of 20532 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.007289 seconds +[default0]: number of documents: 410633 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001701 seconds +[default0]: number of documents: 410633 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ta_validation_indexmap_4943ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ta_validation_indexmap_4943ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.032 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.007314 seconds +[default0]: number of documents: 428843 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [407401, 428843) total of 21442 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.007319 seconds +[default0]: number of documents: 428843 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001970 seconds +[default0]: number of documents: 428843 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_bn_validation_indexmap_4453ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_bn_validation_indexmap_4453ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.026 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.002616 seconds +[default0]: number of documents: 417269 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [396406, 417269) total of 20863 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.005929 seconds +[default0]: number of documents: 417269 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001749 seconds +[default0]: number of documents: 417269 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_mr_validation_indexmap_3563ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_mr_validation_indexmap_3563ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.019 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.004204 seconds +[default0]: number of documents: 1114455 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [1058732, 1114455) total of 55723 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.013822 seconds +[default0]: number of documents: 1114455 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003591 seconds +[default0]: number of documents: 1114455 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_sw_validation_indexmap_2999ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_sw_validation_indexmap_2999ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.029 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.002229 seconds +[default0]: number of documents: 347499 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [330124, 347499) total of 17375 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001704 seconds +[default0]: number of documents: 347499 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001226 seconds +[default0]: number of documents: 347499 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_gu_validation_indexmap_2998ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_gu_validation_indexmap_2998ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.034 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001993 seconds +[default0]: number of documents: 339210 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [322250, 339210) total of 16960 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.006429 seconds +[default0]: number of documents: 339210 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001520 seconds +[default0]: number of documents: 339210 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_pa_validation_indexmap_2798ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_pa_validation_indexmap_2798ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.019 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001894 seconds +[default0]: number of documents: 315754 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [299966, 315754) total of 15788 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.005890 seconds +[default0]: number of documents: 315754 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001233 seconds +[default0]: number of documents: 315754 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ne_validation_indexmap_2105ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ne_validation_indexmap_2105ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.025 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.004107 seconds +[default0]: number of documents: 918416 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [872495, 918416) total of 45921 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.014256 seconds +[default0]: number of documents: 918416 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003549 seconds +[default0]: number of documents: 918416 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_yo_validation_indexmap_1887ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_yo_validation_indexmap_1887ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.048 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.004362 seconds +[default0]: number of documents: 950097 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [902592, 950097) total of 47505 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.017377 seconds +[default0]: number of documents: 950097 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003714 seconds +[default0]: number of documents: 950097 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ig_validation_indexmap_1739ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ig_validation_indexmap_1739ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.018 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.004614 seconds +[default0]: number of documents: 915063 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [869310, 915063) total of 45753 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.011640 seconds +[default0]: number of documents: 915063 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003856 seconds +[default0]: number of documents: 915063 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ny_validation_indexmap_1455ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ny_validation_indexmap_1455ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.005 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.012120 seconds +[default0]: number of documents: 915061 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [869308, 915061) total of 45753 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003804 seconds +[default0]: number of documents: 915061 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003366 seconds +[default0]: number of documents: 915061 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_zu_validation_indexmap_1410ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_zu_validation_indexmap_1410ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.017 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003689 seconds +[default0]: number of documents: 915058 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [869305, 915058) total of 45753 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003094 seconds +[default0]: number of documents: 915058 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003061 seconds +[default0]: number of documents: 915058 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_xh_validation_indexmap_1362ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_xh_validation_indexmap_1362ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.006 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.013027 seconds +[default0]: number of documents: 865056 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [821803, 865056) total of 43253 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003894 seconds +[default0]: number of documents: 865056 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003648 seconds +[default0]: number of documents: 865056 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_sn_validation_indexmap_1350ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_sn_validation_indexmap_1350ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.019 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003690 seconds +[default0]: number of documents: 915044 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [869292, 915044) total of 45752 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003437 seconds +[default0]: number of documents: 915044 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003189 seconds +[default0]: number of documents: 915044 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ts_validation_indexmap_1339ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ts_validation_indexmap_1339ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.005 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003788 seconds +[default0]: number of documents: 915043 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [869291, 915043) total of 45752 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003837 seconds +[default0]: number of documents: 915043 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003141 seconds +[default0]: number of documents: 915043 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_rw_validation_indexmap_1334ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_rw_validation_indexmap_1334ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.005 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.011479 seconds +[default0]: number of documents: 915021 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [869270, 915021) total of 45751 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003878 seconds +[default0]: number of documents: 915021 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003439 seconds +[default0]: number of documents: 915021 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_lg_validation_indexmap_1262ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_lg_validation_indexmap_1262ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.015 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.013075 seconds +[default0]: number of documents: 915054 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [869301, 915054) total of 45753 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.011857 seconds +[default0]: number of documents: 915054 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.004256 seconds +[default0]: number of documents: 915054 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_tn_validation_indexmap_1260ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_tn_validation_indexmap_1260ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.024 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.004372 seconds +[default0]: number of documents: 915051 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [869298, 915051) total of 45753 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003927 seconds +[default0]: number of documents: 915051 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003253 seconds +[default0]: number of documents: 915051 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_nso_validation_indexmap_1229ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_nso_validation_indexmap_1229ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.022 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.006065 seconds +[default0]: number of documents: 318189 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [302280, 318189) total of 15909 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.005129 seconds +[default0]: number of documents: 318189 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001429 seconds +[default0]: number of documents: 318189 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_rn_validation_indexmap_1032ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_rn_validation_indexmap_1032ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.005 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001586 seconds +[default0]: number of documents: 265864 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [252571, 265864) total of 13293 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003890 seconds +[default0]: number of documents: 265864 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001024 seconds +[default0]: number of documents: 265864 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ml_validation_indexmap_936ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ml_validation_indexmap_936ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.020 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.005569 seconds +[default0]: number of documents: 265063 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [251810, 265063) total of 13253 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001236 seconds +[default0]: number of documents: 265063 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001259 seconds +[default0]: number of documents: 265063 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_kn_validation_indexmap_885ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_kn_validation_indexmap_885ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.020 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.006556 seconds +[default0]: number of documents: 265063 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [251810, 265063) total of 13253 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.004989 seconds +[default0]: number of documents: 265063 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001317 seconds +[default0]: number of documents: 265063 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_or_validation_indexmap_864ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_or_validation_indexmap_864ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.020 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.004879 seconds +[default0]: number of documents: 265063 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [251810, 265063) total of 13253 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.004323 seconds +[default0]: number of documents: 265063 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001292 seconds +[default0]: number of documents: 265063 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_as_validation_indexmap_836ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_as_validation_indexmap_836ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.004 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.002126 seconds +[default0]: number of documents: 365060 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [346807, 365060) total of 18253 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.006201 seconds +[default0]: number of documents: 365060 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001631 seconds +[default0]: number of documents: 365060 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ln_validation_indexmap_676ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ln_validation_indexmap_676ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.004 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.007101 seconds +[default0]: number of documents: 365063 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [346810, 365063) total of 18253 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001829 seconds +[default0]: number of documents: 365063 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001672 seconds +[default0]: number of documents: 365063 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_wo_validation_indexmap_656ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_wo_validation_indexmap_656ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.004 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.004704 seconds +[default0]: number of documents: 265063 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [251810, 265063) total of 13253 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.004572 seconds +[default0]: number of documents: 265063 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001318 seconds +[default0]: number of documents: 265063 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_tum_validation_indexmap_621ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_tum_validation_indexmap_621ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.004 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001516 seconds +[default0]: number of documents: 265180 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [251921, 265180) total of 13259 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003703 seconds +[default0]: number of documents: 265180 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001114 seconds +[default0]: number of documents: 265180 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ki_validation_indexmap_616ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ki_validation_indexmap_616ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.021 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.005175 seconds +[default0]: number of documents: 265063 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [251810, 265063) total of 13253 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.004306 seconds +[default0]: number of documents: 265063 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001284 seconds +[default0]: number of documents: 265063 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_st_validation_indexmap_610ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_st_validation_indexmap_610ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.004 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.005472 seconds +[default0]: number of documents: 265063 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [251810, 265063) total of 13253 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.006193 seconds +[default0]: number of documents: 265063 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001327 seconds +[default0]: number of documents: 265063 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_fon_validation_indexmap_605ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_fon_validation_indexmap_605ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.004 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.004043 seconds +[default0]: number of documents: 271191 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [257631, 271191) total of 13560 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.005995 seconds +[default0]: number of documents: 271191 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001255 seconds +[default0]: number of documents: 271191 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ca_validation_indexmap_592ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ca_validation_indexmap_592ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.004 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.004723 seconds +[default0]: number of documents: 269973 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [256474, 269973) total of 13499 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.004180 seconds +[default0]: number of documents: 269973 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001272 seconds +[default0]: number of documents: 269973 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_eu_validation_indexmap_579ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_eu_validation_indexmap_579ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.005 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.005311 seconds +[default0]: number of documents: 265071 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [251817, 265071) total of 13254 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.004150 seconds +[default0]: number of documents: 265071 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001248 seconds +[default0]: number of documents: 265071 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ak_validation_indexmap_578ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ak_validation_indexmap_578ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.005 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.005310 seconds +[default0]: number of documents: 265180 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [251921, 265180) total of 13259 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.004963 seconds +[default0]: number of documents: 265180 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001292 seconds +[default0]: number of documents: 265180 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_bm_validation_indexmap_573ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_bm_validation_indexmap_573ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.021 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001853 seconds +[default0]: number of documents: 265071 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [251817, 265071) total of 13254 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.005108 seconds +[default0]: number of documents: 265071 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001015 seconds +[default0]: number of documents: 265071 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_tw_validation_indexmap_569ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_tw_validation_indexmap_569ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.005 seconds +[default0]:> building indices for blendable datasets ... +[default0]: > sample ratios: +[default0]: dataset 0, input: 0.387164, achieved: 0.387163 +[default0]: dataset 1, input: 0.0786754, achieved: 0.0786745 +[default0]: dataset 2, input: 0.0636898, achieved: 0.0636904 +[default0]: dataset 3, input: 0.0584986, achieved: 0.0584976 +[default0]: dataset 4, input: 0.0576332, achieved: 0.0576328 +[default0]: dataset 5, input: 0.0485994, achieved: 0.0485991 +[default0]: dataset 6, input: 0.0478619, achieved: 0.0478619 +[default0]: dataset 7, input: 0.0476917, achieved: 0.0476916 +[default0]: dataset 8, input: 0.045653, achieved: 0.0456537 +[default0]: dataset 9, input: 0.0322254, achieved: 0.0322257 +[default0]: dataset 10, input: 0.0199319, achieved: 0.0199317 +[default0]: dataset 11, input: 0.0138497, achieved: 0.0138502 +[default0]: dataset 12, input: 0.00960604, achieved: 0.00960574 +[default0]: dataset 13, input: 0.00865243, achieved: 0.00865232 +[default0]: dataset 14, input: 0.00692261, achieved: 0.00692254 +[default0]: dataset 15, input: 0.00582803, achieved: 0.0058278 +[default0]: dataset 16, input: 0.00582586, achieved: 0.0058261 +[default0]: dataset 17, input: 0.00543684, achieved: 0.00543622 +[default0]: dataset 18, input: 0.00409056, achieved: 0.00409121 +[default0]: dataset 19, input: 0.00366562, achieved: 0.00366557 +[default0]: dataset 20, input: 0.00337934, achieved: 0.00337955 +[default0]: dataset 21, input: 0.00282756, achieved: 0.00282792 +[default0]: dataset 22, input: 0.00274012, achieved: 0.00273939 +[default0]: dataset 23, input: 0.00264619, achieved: 0.00264575 +[default0]: dataset 24, input: 0.00262358, achieved: 0.00262362 +[default0]: dataset 25, input: 0.0026003, achieved: 0.00259978 +[default0]: dataset 26, input: 0.00259099, achieved: 0.00259127 +[default0]: dataset 27, input: 0.00245151, achieved: 0.00245166 +[default0]: dataset 28, input: 0.00244735, achieved: 0.00244826 +[default0]: dataset 29, input: 0.00238684, achieved: 0.00238696 +[default0]: dataset 30, input: 0.0020053, achieved: 0.00200559 +[default0]: dataset 31, input: 0.00181876, achieved: 0.00181831 +[default0]: dataset 32, input: 0.00171918, achieved: 0.00171957 +[default0]: dataset 33, input: 0.00167779, achieved: 0.00167871 +[default0]: dataset 34, input: 0.00162359, achieved: 0.00162423 +[default0]: dataset 35, input: 0.00131237, achieved: 0.00131266 +[default0]: dataset 36, input: 0.00127347, achieved: 0.0012735 +[default0]: dataset 37, input: 0.00120564, achieved: 0.0012054 +[default0]: dataset 38, input: 0.00119533, achieved: 0.00119518 +[default0]: dataset 39, input: 0.00118536, achieved: 0.00118497 +[default0]: dataset 40, input: 0.00117488, achieved: 0.00117475 +[default0]: dataset 41, input: 0.00114928, achieved: 0.00114922 +[default0]: dataset 42, input: 0.00112334, achieved: 0.00112368 +[default0]: dataset 43, input: 0.00112318, achieved: 0.00112368 +[default0]: dataset 44, input: 0.00111237, achieved: 0.00111176 +[default0]: dataset 45, input: 0.00110439, achieved: 0.00110495 +[default0]:> elapsed time for building blendable dataset indices: 0.05 (sec) +[default2]:GOTCONSUMEDSAMPLES 0 0 +[default2]:GOTCONSUMEDSAMPLES 0 0 +[default0]:GOTCONSUMEDSAMPLES 0 0 +[default0]:GOTCONSUMEDSAMPLES 0 0 +[default2]:GOTCONSUMEDSAMPLES 0 0 +[default0]:GOTCONSUMEDSAMPLES 0 0 +[default0]:GOTCONSUMEDSAMPLES 0 0 +[default2]:GOTCONSUMEDSAMPLES 0 0 +[default0]:GOTCONSUMEDSAMPLES 0 0 +[default2]:GOTCONSUMEDSAMPLES 0 0 +[default0]:GOTCONSUMEDSAMPLES 0 0 +[default2]:GOTCONSUMEDSAMPLES 0 0 +[default0]:GOTCONSUMEDSAMPLES 0 0 +[default2]:GOTCONSUMEDSAMPLES 0 0 +[default0]:GOTCONSUMEDSAMPLES 0 0 +[default2]:GOTCONSUMEDSAMPLES 0 0 +[default2]:GOTCONSUMEDSAMPLES 0 0 +[default0]:GOTCONSUMEDSAMPLES 0 0 +[default2]:GOTCONSUMEDSAMPLES 0 0 +[default2]:GOTCONSUMEDSAMPLES 0 0 +[default0]:GOTCONSUMEDSAMPLES 0 0 +[default2]:GOTCONSUMEDSAMPLES 0 0 +[default0]:GOTCONSUMEDSAMPLES 0 0 +[default0]:GOTCONSUMEDSAMPLES 0 0 +[default0]:GOTCONSUMEDSAMPLES 0 0 +[default2]:GOTCONSUMEDSAMPLES 0 0 +[default2]:GOTCONSUMEDSAMPLES 0 0 +[default2]:GOTCONSUMEDSAMPLES 0 0 +[default0]:> finished creating T0 datasets ... +[default0]:GOTCONSUMEDSAMPLES 0 0 +[default0]:GOTCONSUMEDSAMPLES 0 0 +[default2]:GOTCONSUMEDSAMPLES 0 0 +[default0]:GOTCONSUMEDSAMPLES 0 0 +[default0]:[000-001] 2.2365B / 1.2089B +[default1]:[001-001] 2.2365B / 1.2089B +[default1]:[001-000] 2.2365B / 1.2089B +[default0]:[after dataloaders are built] datetime: 2022-10-07 12:25:54 +[default0]:done with setup ... +[default0]:training ... +[default0]:Number of parameters: [tensor rank - pipeline rank] w/ and w/o embeddings: +[default0]:[000-000] 2.2365B / 1.2089B +[default0]:[before the start of training step] datetime: 2022-10-07 12:25:54 +[default3]:time (ms) | model-and-optimizer-setup: 13966.70 | train/valid/test-data-iterators-setup: 12047.06 +[default1]:Traceback (most recent call last): +[default1]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default1]: main() +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default1]: return f(*args, **kwargs) +[default1]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default1]: pretrain( +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 188, in pretrain +[default1]: iteration = train(forward_step_func, +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 930, in train +[default1]: train_step(forward_step_func, +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 510, in train_step +[default1]: loss = model[0].train_batch(data_iter=data_iterator) +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 345, in train_batch +[default1]: self._exec_schedule(sched) +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1375, in _exec_schedule +[default1]: self._exec_instr(**cmd.kwargs) +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 789, in _exec_load_micro_batch +[default1]: batch = self._next_batch() +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 625, in _next_batch +[default1]: batch = self.batch_fn(batch) +[default1]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 70, in get_batch_pipe +[default1]: if 'text' in data: +[default1]:TypeError: argument of type 'NoneType' is not iterable +[default3]:Traceback (most recent call last): +[default3]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default3]: main() +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default3]: return f(*args, **kwargs) +[default3]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default3]: pretrain( +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 188, in pretrain +[default3]: iteration = train(forward_step_func, +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 930, in train +[default3]: train_step(forward_step_func, +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 510, in train_step +[default3]: loss = model[0].train_batch(data_iter=data_iterator) +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 345, in train_batch +[default3]: self._exec_schedule(sched) +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1375, in _exec_schedule +[default3]: self._exec_instr(**cmd.kwargs) +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 789, in _exec_load_micro_batch +[default3]: batch = self._next_batch() +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 625, in _next_batch +[default3]: batch = self.batch_fn(batch) +[default3]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 70, in get_batch_pipe +[default3]: if 'text' in data: +[default3]:TypeError: argument of type 'NoneType' is not iterable +[default1]:Traceback (most recent call last): +[default1]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default1]: main() +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default1]: return f(*args, **kwargs) +[default1]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default1]: pretrain( +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 188, in pretrain +[default1]: iteration = train(forward_step_func, +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 930, in train +[default1]: train_step(forward_step_func, +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 510, in train_step +[default1]: loss = model[0].train_batch(data_iter=data_iterator) +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 345, in train_batch +[default1]: self._exec_schedule(sched) +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1375, in _exec_schedule +[default1]: self._exec_instr(**cmd.kwargs) +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 789, in _exec_load_micro_batch +[default1]: batch = self._next_batch() +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 625, in _next_batch +[default1]: batch = self.batch_fn(batch) +[default1]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 70, in get_batch_pipe +[default1]: if 'text' in data: +[default1]:TypeError: argument of type 'NoneType' is not iterable +[default3]:Traceback (most recent call last): +[default3]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default3]: main() +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default3]: return f(*args, **kwargs) +[default3]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default3]: pretrain( +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 188, in pretrain +[default3]: iteration = train(forward_step_func, +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 930, in train +[default3]: train_step(forward_step_func, +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 510, in train_step +[default3]: loss = model[0].train_batch(data_iter=data_iterator) +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 345, in train_batch +[default3]: self._exec_schedule(sched) +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1375, in _exec_schedule +[default3]: self._exec_instr(**cmd.kwargs) +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 789, in _exec_load_micro_batch +[default3]: batch = self._next_batch() +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 625, in _next_batch +[default3]: batch = self.batch_fn(batch) +[default3]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 70, in get_batch_pipe +[default3]: if 'text' in data: +[default3]:TypeError: argument of type 'NoneType' is not iterable +[default1]:Traceback (most recent call last): +[default1]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default1]: main() +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default1]: return f(*args, **kwargs) +[default1]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default1]: pretrain( +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 188, in pretrain +[default1]: iteration = train(forward_step_func, +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 930, in train +[default1]: train_step(forward_step_func, +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 510, in train_step +[default1]: loss = model[0].train_batch(data_iter=data_iterator) +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 345, in train_batch +[default1]: self._exec_schedule(sched) +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1375, in _exec_schedule +[default1]: self._exec_instr(**cmd.kwargs) +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 789, in _exec_load_micro_batch +[default1]: batch = self._next_batch() +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 625, in _next_batch +[default1]: batch = self.batch_fn(batch) +[default1]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 70, in get_batch_pipe +[default1]: if 'text' in data: +[default1]:TypeError: argument of type 'NoneType' is not iterable +[default1]:Traceback (most recent call last): +[default1]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default1]: main() +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default1]: return f(*args, **kwargs) +[default1]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default1]: pretrain( +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 188, in pretrain +[default1]: iteration = train(forward_step_func, +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 930, in train +[default1]: train_step(forward_step_func, +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 510, in train_step +[default1]: loss = model[0].train_batch(data_iter=data_iterator) +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 345, in train_batch +[default1]: self._exec_schedule(sched) +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1375, in _exec_schedule +[default1]: self._exec_instr(**cmd.kwargs) +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 789, in _exec_load_micro_batch +[default1]: batch = self._next_batch() +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 625, in _next_batch +[default1]: batch = self.batch_fn(batch) +[default1]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 70, in get_batch_pipe +[default1]: if 'text' in data: +[default1]:TypeError: argument of type 'NoneType' is not iterable +[default1]:Traceback (most recent call last): +[default1]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default1]: main() +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default1]: return f(*args, **kwargs) +[default1]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default1]: pretrain( +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 188, in pretrain +[default1]: iteration = train(forward_step_func, +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 930, in train +[default1]: train_step(forward_step_func, +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 510, in train_step +[default1]: loss = model[0].train_batch(data_iter=data_iterator) +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 345, in train_batch +[default1]: self._exec_schedule(sched) +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1375, in _exec_schedule +[default1]: self._exec_instr(**cmd.kwargs) +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 789, in _exec_load_micro_batch +[default1]: batch = self._next_batch() +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 625, in _next_batch +[default1]: batch = self.batch_fn(batch) +[default1]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 70, in get_batch_pipe +[default1]: if 'text' in data: +[default1]:TypeError: argument of type 'NoneType' is not iterable +[default3]:Traceback (most recent call last): +[default3]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default3]: main() +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default3]: return f(*args, **kwargs) +[default3]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default3]: pretrain( +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 188, in pretrain +[default3]: iteration = train(forward_step_func, +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 930, in train +[default3]: train_step(forward_step_func, +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 510, in train_step +[default3]: loss = model[0].train_batch(data_iter=data_iterator) +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 345, in train_batch +[default3]: self._exec_schedule(sched) +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1375, in _exec_schedule +[default3]: self._exec_instr(**cmd.kwargs) +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 789, in _exec_load_micro_batch +[default3]: batch = self._next_batch() +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 625, in _next_batch +[default3]: batch = self.batch_fn(batch) +[default3]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 70, in get_batch_pipe +[default3]: if 'text' in data: +[default3]:TypeError: argument of type 'NoneType' is not iterable +[default1]:Traceback (most recent call last): +[default1]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default1]: main() +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default1]: return f(*args, **kwargs) +[default1]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default1]: pretrain( +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 188, in pretrain +[default1]: iteration = train(forward_step_func, +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 930, in train +[default1]: train_step(forward_step_func, +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 510, in train_step +[default1]: loss = model[0].train_batch(data_iter=data_iterator) +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 345, in train_batch +[default1]: self._exec_schedule(sched) +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1375, in _exec_schedule +[default1]: self._exec_instr(**cmd.kwargs) +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 789, in _exec_load_micro_batch +[default1]: batch = self._next_batch() +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 625, in _next_batch +[default1]: batch = self.batch_fn(batch) +[default1]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 70, in get_batch_pipe +[default1]: if 'text' in data: +[default1]:TypeError: argument of type 'NoneType' is not iterable +[default3]:Traceback (most recent call last): +[default3]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default3]: main() +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default3]: return f(*args, **kwargs) +[default3]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default3]: pretrain( +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 188, in pretrain +[default3]: iteration = train(forward_step_func, +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 930, in train +[default3]: train_step(forward_step_func, +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 510, in train_step +[default3]: loss = model[0].train_batch(data_iter=data_iterator) +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 345, in train_batch +[default3]: self._exec_schedule(sched) +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1375, in _exec_schedule +[default3]: self._exec_instr(**cmd.kwargs) +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 789, in _exec_load_micro_batch +[default3]: batch = self._next_batch() +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 625, in _next_batch +[default3]: batch = self.batch_fn(batch) +[default3]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 70, in get_batch_pipe +[default3]: if 'text' in data: +[default3]:TypeError: argument of type 'NoneType' is not iterable +[default1]:Traceback (most recent call last): +[default1]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default1]: main() +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default1]: return f(*args, **kwargs) +[default1]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default1]: pretrain( +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 188, in pretrain +[default1]: iteration = train(forward_step_func, +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 930, in train +[default1]: train_step(forward_step_func, +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 510, in train_step +[default1]: loss = model[0].train_batch(data_iter=data_iterator) +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 345, in train_batch +[default1]: self._exec_schedule(sched) +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1375, in _exec_schedule +[default1]: self._exec_instr(**cmd.kwargs) +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 789, in _exec_load_micro_batch +[default1]: batch = self._next_batch() +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 625, in _next_batch +[default1]: batch = self.batch_fn(batch) +[default1]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 70, in get_batch_pipe +[default1]: if 'text' in data: +[default1]:TypeError: argument of type 'NoneType' is not iterable +[default1]:Traceback (most recent call last): +[default1]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default1]: main() +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default1]: return f(*args, **kwargs) +[default1]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default1]: pretrain( +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 188, in pretrain +[default1]: iteration = train(forward_step_func, +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 930, in train +[default1]: train_step(forward_step_func, +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 510, in train_step +[default1]: loss = model[0].train_batch(data_iter=data_iterator) +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 345, in train_batch +[default1]: self._exec_schedule(sched) +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1375, in _exec_schedule +[default1]: self._exec_instr(**cmd.kwargs) +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 789, in _exec_load_micro_batch +[default1]: batch = self._next_batch() +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 625, in _next_batch +[default1]: batch = self.batch_fn(batch) +[default1]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 70, in get_batch_pipe +[default1]: if 'text' in data: +[default1]:TypeError: argument of type 'NoneType' is not iterable +[default3]:Traceback (most recent call last): +[default3]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default3]: main() +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default3]: return f(*args, **kwargs) +[default3]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default3]: pretrain( +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 188, in pretrain +[default3]: iteration = train(forward_step_func, +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 930, in train +[default3]: train_step(forward_step_func, +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 510, in train_step +[default3]: loss = model[0].train_batch(data_iter=data_iterator) +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 345, in train_batch +[default3]: self._exec_schedule(sched) +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1375, in _exec_schedule +[default3]: self._exec_instr(**cmd.kwargs) +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 789, in _exec_load_micro_batch +[default3]: batch = self._next_batch() +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 625, in _next_batch +[default3]: batch = self.batch_fn(batch) +[default3]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 70, in get_batch_pipe +[default3]: if 'text' in data: +[default3]:TypeError: argument of type 'NoneType' is not iterable +[default3]:Traceback (most recent call last): +[default3]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default3]: main() +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default3]: return f(*args, **kwargs) +[default3]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default3]: pretrain( +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 188, in pretrain +[default3]: iteration = train(forward_step_func, +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 930, in train +[default3]: train_step(forward_step_func, +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 510, in train_step +[default3]: loss = model[0].train_batch(data_iter=data_iterator) +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 345, in train_batch +[default3]: self._exec_schedule(sched) +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1375, in _exec_schedule +[default3]: self._exec_instr(**cmd.kwargs) +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 789, in _exec_load_micro_batch +[default3]: batch = self._next_batch() +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 625, in _next_batch +[default3]: batch = self.batch_fn(batch) +[default3]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 70, in get_batch_pipe +[default3]: if 'text' in data: +[default3]:TypeError: argument of type 'NoneType' is not iterable +[default3]:Traceback (most recent call last): +[default3]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default3]: main() +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default3]: return f(*args, **kwargs) +[default3]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default3]: pretrain( +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 188, in pretrain +[default3]: iteration = train(forward_step_func, +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 930, in train +[default3]: train_step(forward_step_func, +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 510, in train_step +[default3]: loss = model[0].train_batch(data_iter=data_iterator) +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 345, in train_batch +[default3]: self._exec_schedule(sched) +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1375, in _exec_schedule +[default3]: self._exec_instr(**cmd.kwargs) +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 789, in _exec_load_micro_batch +[default3]: batch = self._next_batch() +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 625, in _next_batch +[default3]: batch = self.batch_fn(batch) +[default3]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 70, in get_batch_pipe +[default3]: if 'text' in data: +[default3]:TypeError: argument of type 'NoneType' is not iterable +[default3]:Traceback (most recent call last): +[default3]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default3]: main() +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default3]: return f(*args, **kwargs) +[default3]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default3]: pretrain( +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 188, in pretrain +[default3]: iteration = train(forward_step_func, +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 930, in train +[default3]: train_step(forward_step_func, +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 510, in train_step +[default3]: loss = model[0].train_batch(data_iter=data_iterator) +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 345, in train_batch +[default3]: self._exec_schedule(sched) +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1375, in _exec_schedule +[default3]: self._exec_instr(**cmd.kwargs) +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 789, in _exec_load_micro_batch +[default3]: batch = self._next_batch() +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 625, in _next_batch +[default3]: batch = self.batch_fn(batch) +[default3]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 70, in get_batch_pipe +[default3]: if 'text' in data: +[default3]:TypeError: argument of type 'NoneType' is not iterable +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 247486 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 247488 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 222253 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 222255 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 221118 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 221120 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 222079 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 222081 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 222733 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 222735 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 222448 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 222450 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 239865 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 239867 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 231631 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 231633 closing signal SIGTERM +ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 1 (pid: 239866) of binary: /gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/bin/python +ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 1 (pid: 221119) of binary: /gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/bin/python +ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 1 (pid: 222080) of binary: /gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/bin/python +ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 1 (pid: 222449) of binary: /gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/bin/python +ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 1 (pid: 222734) of binary: /gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/bin/python +ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 1 (pid: 247487) of binary: /gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/bin/python +ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 1 (pid: 222254) of binary: /gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/bin/python +ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 1 (pid: 231632) of binary: /gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/bin/python +ERROR:torch.distributed.elastic.agent.server.api:Error waiting on exit barrier. Elapsed: 301.23309230804443 seconds +Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/agent/server/api.py", line 906, in _exit_barrier + store_util.barrier( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/utils/store.py", line 78, in barrier + synchronize(store, data, rank, world_size, key_prefix, barrier_timeout) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/utils/store.py", line 64, in synchronize + agent_data = get_all(store, rank, key_prefix, world_size) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/utils/store.py", line 34, in get_all + data = store.get(f"{prefix}{idx}") +RuntimeError: Socket Timeout +Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 194, in _run_module_as_main + return _run_code(code, main_globals, None, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 87, in _run_code + exec(code, run_globals) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 765, in + main() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 761, in main + run(args) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 752, in run + elastic_launch( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 131, in __call__ + return launch_agent(self._config, self._entrypoint, list(args)) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 245, in launch_agent + raise ChildFailedError( +torch.distributed.elastic.multiprocessing.errors.ChildFailedError: +============================================================ +/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py FAILED +------------------------------------------------------------ +Failures: +[1]: + time : 2022-10-07_12:25:54 + host : r6i4n7-ib0 + rank : 15 (local_rank: 3) + exitcode : 1 (pid: 222736) + error_file: /tmp/torchelastic_c6duoauk/none_b08t4el4/attempt_0/3/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 188, in pretrain + iteration = train(forward_step_func, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 930, in train + train_step(forward_step_func, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 510, in train_step + loss = model[0].train_batch(data_iter=data_iterator) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 345, in train_batch + self._exec_schedule(sched) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1375, in _exec_schedule + self._exec_instr(**cmd.kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 789, in _exec_load_micro_batch + batch = self._next_batch() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 625, in _next_batch + batch = self.batch_fn(batch) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 70, in get_batch_pipe + if 'text' in data: + TypeError: argument of type 'NoneType' is not iterable + +------------------------------------------------------------ +Root Cause (first observed failure): +[0]: + time : 2022-10-07_12:25:54 + host : r6i4n7-ib0 + rank : 13 (local_rank: 1) + exitcode : 1 (pid: 222734) + error_file: /tmp/torchelastic_c6duoauk/none_b08t4el4/attempt_0/1/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 188, in pretrain + iteration = train(forward_step_func, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 930, in train + train_step(forward_step_func, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 510, in train_step + loss = model[0].train_batch(data_iter=data_iterator) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 345, in train_batch + self._exec_schedule(sched) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1375, in _exec_schedule + self._exec_instr(**cmd.kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 789, in _exec_load_micro_batch + batch = self._next_batch() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 625, in _next_batch + batch = self.batch_fn(batch) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 70, in get_batch_pipe + if 'text' in data: + TypeError: argument of type 'NoneType' is not iterable + +============================================================ +srun: error: r6i4n7: task 3: Exited with exit code 1 +srun: launch/slurm: _step_signal: Terminating StepId=2095810.0 +slurmstepd: error: *** STEP 2095810.0 ON r6i4n4 CANCELLED AT 2022-10-07T12:31:00 *** +WARNING:torch.distributed.elastic.agent.server.api:Received 15 death signal, shutting down workers +WARNING:torch.distributed.elastic.agent.server.api:Received 15 death signal, shutting down workers +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 2918399 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 623627 closing signal SIGTERM +WARNING:torch.distributed.elastic.agent.server.api:Received 15 death signal, shutting down workers +WARNING:torch.distributed.elastic.agent.server.api:Received 15 death signal, shutting down workers +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 220116 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 2918400 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 3372562 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 623628 closing signal SIGTERM +WARNING:torch.distributed.elastic.agent.server.api:Received 15 death signal, shutting down workers +WARNING:torch.distributed.elastic.agent.server.api:Received 15 death signal, shutting down workers +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 3617671 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 204912 closing signal SIGTERM +WARNING:torch.distributed.elastic.agent.server.api:Received 15 death signal, shutting down workers +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 623629 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 2918401 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 3372563 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 174468 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 220117 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 3617672 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 204913 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 623630 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 2918402 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 3617673 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 3372564 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 174469 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 220118 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 204914 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 3617674 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 3372565 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 174470 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 220119 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 204915 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 174471 closing signal SIGTERM +WARNING:torch.distributed.elastic.agent.server.api:Received 15 death signal, shutting down workers +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 222422 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 222423 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 222424 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 222425 closing signal SIGTERM +WARNING:torch.distributed.elastic.agent.server.api:Received 15 death signal, shutting down workers +Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/agent/server/api.py", line 906, in _exit_barrier + store_util.barrier( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/utils/store.py", line 78, in barrier + synchronize(store, data, rank, world_size, key_prefix, barrier_timeout) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/utils/store.py", line 64, in synchronize + agent_data = get_all(store, rank, key_prefix, world_size) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/utils/store.py", line 34, in get_all + data = store.get(f"{prefix}{idx}") +RuntimeError: Socket Timeout + +During handling of the above exception, another exception occurred: + +Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 194, in _run_module_as_main + return _run_code(code, main_globals, None, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 87, in _run_code + exec(code, run_globals) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 765, in + main() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 761, in main + run(args) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 752, in run + elastic_launch( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 131, in __call__ + return launch_agent(self._config, self._entrypoint, list(args)) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 236, in launch_agent + result = agent.run() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/metrics/api.py", line 125, in wrapper + result = f(*args, **kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/agent/server/api.py", line 709, in run + result = self._invoke_run(role) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/agent/server/api.py", line 877, in _invoke_run + self._exit_barrier() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/agent/server/api.py", line 906, in _exit_barrier + store_util.barrier( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/api.py", line 60, in _terminate_process_handler + raise SignalException(f"Process {os.getpid()} got signal: {sigval}", sigval=sigval) +torch.distributed.elastic.multiprocessing.api.SignalException: Process 221090 got signal: 15 +WARNING:torch.distributed.elastic.agent.server.api:Received 15 death signal, shutting down workers +WARNING:torch.distributed.elastic.agent.server.api:Received 15 death signal, shutting down workers +WARNING:torch.distributed.elastic.agent.server.api:Received 15 death signal, shutting down workers +WARNING:torch.distributed.elastic.agent.server.api:Received 15 death signal, shutting down workers +Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/agent/server/api.py", line 906, in _exit_barrier +Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/agent/server/api.py", line 906, in _exit_barrier + store_util.barrier( + store_util.barrier( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/utils/store.py", line 78, in barrier + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/utils/store.py", line 78, in barrier +Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/agent/server/api.py", line 906, in _exit_barrier + synchronize(store, data, rank, world_size, key_prefix, barrier_timeout) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/utils/store.py", line 64, in synchronize + synchronize(store, data, rank, world_size, key_prefix, barrier_timeout) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/utils/store.py", line 64, in synchronize + agent_data = get_all(store, rank, key_prefix, world_size) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/utils/store.py", line 34, in get_all + data = store.get(f"{prefix}{idx}") + agent_data = get_all(store, rank, key_prefix, world_size) +RuntimeError: Socket Timeout + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/utils/store.py", line 34, in get_all + +During handling of the above exception, another exception occurred: + +Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 194, in _run_module_as_main + store_util.barrier( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/utils/store.py", line 78, in barrier + return _run_code(code, main_globals, None, + data = store.get(f"{prefix}{idx}") + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 87, in _run_code +RuntimeError: Socket Timeout + exec(code, run_globals) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 765, in + +During handling of the above exception, another exception occurred: + +Traceback (most recent call last): + synchronize(store, data, rank, world_size, key_prefix, barrier_timeout) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 194, in _run_module_as_main + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/utils/store.py", line 64, in synchronize + return _run_code(code, main_globals, None, + agent_data = get_all(store, rank, key_prefix, world_size) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/utils/store.py", line 34, in get_all + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 87, in _run_code + exec(code, run_globals) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 765, in + data = store.get(f"{prefix}{idx}") +RuntimeError: Socket Timeout + +During handling of the above exception, another exception occurred: + +Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 194, in _run_module_as_main + main() + return _run_code(code, main_globals, None, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 87, in _run_code +Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/agent/server/api.py", line 906, in _exit_barrier + exec(code, run_globals) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 765, in + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 761, in main + main() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 761, in main + main() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + run(args) + store_util.barrier( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/utils/store.py", line 78, in barrier + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 752, in run + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 761, in main + run(args) + synchronize(store, data, rank, world_size, key_prefix, barrier_timeout) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/utils/store.py", line 64, in synchronize + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 752, in run + elastic_launch( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 131, in __call__ + agent_data = get_all(store, rank, key_prefix, world_size) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/utils/store.py", line 34, in get_all + return launch_agent(self._config, self._entrypoint, list(args)) + run(args) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 236, in launch_agent + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 752, in run + data = store.get(f"{prefix}{idx}") + result = agent.run() +RuntimeError: Socket Timeout + +During handling of the above exception, another exception occurred: + +Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 194, in _run_module_as_main + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/metrics/api.py", line 125, in wrapper + elastic_launch( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 131, in __call__ + result = f(*args, **kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/agent/server/api.py", line 709, in run + return launch_agent(self._config, self._entrypoint, list(args)) + return _run_code(code, main_globals, None, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 87, in _run_code + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 236, in launch_agent + exec(code, run_globals) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 765, in + result = agent.run() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/metrics/api.py", line 125, in wrapper + elastic_launch( + result = self._invoke_run(role) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 131, in __call__ + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/agent/server/api.py", line 877, in _invoke_run + result = f(*args, **kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/agent/server/api.py", line 709, in run + return launch_agent(self._config, self._entrypoint, list(args)) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 236, in launch_agent + result = agent.run() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/metrics/api.py", line 125, in wrapper + self._exit_barrier() + result = self._invoke_run(role) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/agent/server/api.py", line 906, in _exit_barrier + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/agent/server/api.py", line 877, in _invoke_run + result = f(*args, **kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/agent/server/api.py", line 709, in run + main() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + store_util.barrier( + self._exit_barrier() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/api.py", line 60, in _terminate_process_handler + result = self._invoke_run(role) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/agent/server/api.py", line 906, in _exit_barrier + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/agent/server/api.py", line 877, in _invoke_run + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 761, in main + raise SignalException(f"Process {os.getpid()} got signal: {sigval}", sigval=sigval) +torch.distributed.elastic.multiprocessing.api.SignalException: Process 247458 got signal: 15 + store_util.barrier( + self._exit_barrier() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/api.py", line 60, in _terminate_process_handler + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/agent/server/api.py", line 906, in _exit_barrier + raise SignalException(f"Process {os.getpid()} got signal: {sigval}", sigval=sigval) +torch.distributed.elastic.multiprocessing.api.SignalException: Process 231603 got signal: 15 + run(args) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 752, in run + store_util.barrier( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/api.py", line 60, in _terminate_process_handler + raise SignalException(f"Process {os.getpid()} got signal: {sigval}", sigval=sigval) +torch.distributed.elastic.multiprocessing.api.SignalException: Process 222051 got signal: 15 + elastic_launch( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 131, in __call__ + return launch_agent(self._config, self._entrypoint, list(args)) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 236, in launch_agent + result = agent.run() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/metrics/api.py", line 125, in wrapper + result = f(*args, **kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/agent/server/api.py", line 709, in run + result = self._invoke_run(role) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/agent/server/api.py", line 877, in _invoke_run + self._exit_barrier() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/agent/server/api.py", line 906, in _exit_barrier + store_util.barrier( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/api.py", line 60, in _terminate_process_handler + raise SignalException(f"Process {os.getpid()} got signal: {sigval}", sigval=sigval) +torch.distributed.elastic.multiprocessing.api.SignalException: Process 239836 got signal: 15 +WARNING:torch.distributed.elastic.agent.server.api:Received 15 death signal, shutting down workers +WARNING:torch.distributed.elastic.agent.server.api:Received 15 death signal, shutting down workers +Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/agent/server/api.py", line 906, in _exit_barrier +Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/agent/server/api.py", line 906, in _exit_barrier + store_util.barrier( + store_util.barrier( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/utils/store.py", line 78, in barrier + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/utils/store.py", line 78, in barrier + synchronize(store, data, rank, world_size, key_prefix, barrier_timeout) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/utils/store.py", line 64, in synchronize + synchronize(store, data, rank, world_size, key_prefix, barrier_timeout) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/utils/store.py", line 64, in synchronize + agent_data = get_all(store, rank, key_prefix, world_size) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/utils/store.py", line 34, in get_all + agent_data = get_all(store, rank, key_prefix, world_size) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/utils/store.py", line 34, in get_all + data = store.get(f"{prefix}{idx}") +RuntimeError: Socket Timeout + +During handling of the above exception, another exception occurred: + + data = store.get(f"{prefix}{idx}") +Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 194, in _run_module_as_main +RuntimeError: Socket Timeout + +During handling of the above exception, another exception occurred: + +Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 194, in _run_module_as_main + return _run_code(code, main_globals, None, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 87, in _run_code + return _run_code(code, main_globals, None, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 87, in _run_code + exec(code, run_globals) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 765, in + exec(code, run_globals) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 765, in + main() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + main() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 761, in main + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 761, in main + run(args) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 752, in run + run(args) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 752, in run + elastic_launch( + elastic_launch( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 131, in __call__ + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 131, in __call__ + return launch_agent(self._config, self._entrypoint, list(args)) + return launch_agent(self._config, self._entrypoint, list(args)) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 236, in launch_agent + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 236, in launch_agent + result = agent.run() + result = agent.run() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/metrics/api.py", line 125, in wrapper + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/metrics/api.py", line 125, in wrapper + result = f(*args, **kwargs) + result = f(*args, **kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/agent/server/api.py", line 709, in run + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/agent/server/api.py", line 709, in run + result = self._invoke_run(role) + result = self._invoke_run(role) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/agent/server/api.py", line 877, in _invoke_run + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/agent/server/api.py", line 877, in _invoke_run + self._exit_barrier() + self._exit_barrier() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/agent/server/api.py", line 906, in _exit_barrier + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/agent/server/api.py", line 906, in _exit_barrier + store_util.barrier( + store_util.barrier( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/api.py", line 60, in _terminate_process_handler + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/api.py", line 60, in _terminate_process_handler + raise SignalException(f"Process {os.getpid()} got signal: {sigval}", sigval=sigval) +torch.distributed.elastic.multiprocessing.api.SignalException: Process 222225 got signal: 15 + raise SignalException(f"Process {os.getpid()} got signal: {sigval}", sigval=sigval) +torch.distributed.elastic.multiprocessing.api.SignalException: Process 222420 got signal: 15 +srun: error: r6i4n4: task 0: Exited with exit code 1 +srun: error: r6i5n1: task 6: Exited with exit code 1 +srun: error: r6i4n5: task 1: Exited with exit code 1 +srun: error: r6i5n0: task 5: Exited with exit code 1 +srun: error: r6i4n6: task 2: Exited with exit code 1 +srun: error: r6i5n2: task 7: Exited with exit code 1 +srun: error: r6i4n8: task 4: Exited with exit code 1 +Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 194, in _run_module_as_main + return _run_code(code, main_globals, None, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 87, in _run_code + exec(code, run_globals) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 765, in + main() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 761, in main + run(args) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 752, in run + elastic_launch( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 131, in __call__ + return launch_agent(self._config, self._entrypoint, list(args)) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 236, in launch_agent + result = agent.run() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/metrics/api.py", line 125, in wrapper + result = f(*args, **kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/agent/server/api.py", line 709, in run + result = self._invoke_run(role) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/agent/server/api.py", line 850, in _invoke_run + time.sleep(monitor_interval) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/api.py", line 60, in _terminate_process_handler + raise SignalException(f"Process {os.getpid()} got signal: {sigval}", sigval=sigval) +torch.distributed.elastic.multiprocessing.api.SignalException: Process 204883 got signal: 15 +Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 194, in _run_module_as_main + return _run_code(code, main_globals, None, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 87, in _run_code + exec(code, run_globals) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 765, in + main() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 761, in main + run(args) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 752, in run + elastic_launch( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 131, in __call__ + return launch_agent(self._config, self._entrypoint, list(args)) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 236, in launch_agent + result = agent.run() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/metrics/api.py", line 125, in wrapper + result = f(*args, **kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/agent/server/api.py", line 709, in run + result = self._invoke_run(role) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/agent/server/api.py", line 850, in _invoke_run + time.sleep(monitor_interval) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/api.py", line 60, in _terminate_process_handler + raise SignalException(f"Process {os.getpid()} got signal: {sigval}", sigval=sigval) +torch.distributed.elastic.multiprocessing.api.SignalException: Process 3372534 got signal: 15 +Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 194, in _run_module_as_main + return _run_code(code, main_globals, None, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 87, in _run_code + exec(code, run_globals) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 765, in + main() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 761, in main + run(args) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 752, in run + elastic_launch( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 131, in __call__ + return launch_agent(self._config, self._entrypoint, list(args)) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 236, in launch_agent + result = agent.run() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/metrics/api.py", line 125, in wrapper + result = f(*args, **kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/agent/server/api.py", line 709, in run + result = self._invoke_run(role) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/agent/server/api.py", line 850, in _invoke_run + time.sleep(monitor_interval) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/api.py", line 60, in _terminate_process_handler + raise SignalException(f"Process {os.getpid()} got signal: {sigval}", sigval=sigval) +torch.distributed.elastic.multiprocessing.api.SignalException: Process 2918370 got signal: 15 +Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 194, in _run_module_as_main + return _run_code(code, main_globals, None, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 87, in _run_code + exec(code, run_globals) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 765, in + main() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 761, in main + run(args) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 752, in run + elastic_launch( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 131, in __call__ + return launch_agent(self._config, self._entrypoint, list(args)) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 236, in launch_agent + result = agent.run() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/metrics/api.py", line 125, in wrapper + result = f(*args, **kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/agent/server/api.py", line 709, in run + result = self._invoke_run(role) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/agent/server/api.py", line 850, in _invoke_run + time.sleep(monitor_interval) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/api.py", line 60, in _terminate_process_handler + raise SignalException(f"Process {os.getpid()} got signal: {sigval}", sigval=sigval) +torch.distributed.elastic.multiprocessing.api.SignalException: Process 3617642 got signal: 15 +Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 194, in _run_module_as_main + return _run_code(code, main_globals, None, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 87, in _run_code + exec(code, run_globals) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 765, in + main() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 761, in main + run(args) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 752, in run + elastic_launch( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 131, in __call__ + return launch_agent(self._config, self._entrypoint, list(args)) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 236, in launch_agent + result = agent.run() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/metrics/api.py", line 125, in wrapper + result = f(*args, **kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/agent/server/api.py", line 709, in run + result = self._invoke_run(role) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/agent/server/api.py", line 850, in _invoke_run + time.sleep(monitor_interval) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/api.py", line 60, in _terminate_process_handler + raise SignalException(f"Process {os.getpid()} got signal: {sigval}", sigval=sigval) +torch.distributed.elastic.multiprocessing.api.SignalException: Process 623599 got signal: 15 +Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 194, in _run_module_as_main + return _run_code(code, main_globals, None, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 87, in _run_code + exec(code, run_globals) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 765, in + main() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 761, in main + run(args) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 752, in run + elastic_launch( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 131, in __call__ + return launch_agent(self._config, self._entrypoint, list(args)) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 236, in launch_agent + result = agent.run() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/metrics/api.py", line 125, in wrapper + result = f(*args, **kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/agent/server/api.py", line 709, in run + result = self._invoke_run(role) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/agent/server/api.py", line 850, in _invoke_run + time.sleep(monitor_interval) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/api.py", line 60, in _terminate_process_handler + raise SignalException(f"Process {os.getpid()} got signal: {sigval}", sigval=sigval) +torch.distributed.elastic.multiprocessing.api.SignalException: Process 174440 got signal: 15 +Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 194, in _run_module_as_main + return _run_code(code, main_globals, None, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 87, in _run_code + exec(code, run_globals) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 765, in + main() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 761, in main + run(args) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 752, in run + elastic_launch( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 131, in __call__ + return launch_agent(self._config, self._entrypoint, list(args)) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 236, in launch_agent + result = agent.run() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/metrics/api.py", line 125, in wrapper + result = f(*args, **kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/agent/server/api.py", line 709, in run + result = self._invoke_run(role) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/agent/server/api.py", line 850, in _invoke_run + time.sleep(monitor_interval) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/api.py", line 60, in _terminate_process_handler + raise SignalException(f"Process {os.getpid()} got signal: {sigval}", sigval=sigval) +torch.distributed.elastic.multiprocessing.api.SignalException: Process 222394 got signal: 15 +Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 194, in _run_module_as_main + return _run_code(code, main_globals, None, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 87, in _run_code + exec(code, run_globals) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 765, in + main() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 761, in main + run(args) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 752, in run + elastic_launch( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 131, in __call__ + return launch_agent(self._config, self._entrypoint, list(args)) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 236, in launch_agent + result = agent.run() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/metrics/api.py", line 125, in wrapper + result = f(*args, **kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/agent/server/api.py", line 709, in run + result = self._invoke_run(role) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/agent/server/api.py", line 850, in _invoke_run + time.sleep(monitor_interval) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/api.py", line 60, in _terminate_process_handler + raise SignalException(f"Process {os.getpid()} got signal: {sigval}", sigval=sigval) +torch.distributed.elastic.multiprocessing.api.SignalException: Process 220088 got signal: 15 +srun: error: r9i5n6: task 15: Exited with exit code 1 +srun: error: r9i0n5: task 8: Exited with exit code 1 +srun: error: r9i0n6: task 9: Exited with exit code 1 +srun: error: r9i0n7: task 10: Exited with exit code 1 +srun: error: r9i5n5: task 14: Exited with exit code 1 +srun: error: r9i5n4: task 13: Exited with exit code 1 +srun: error: r9i5n2: task 11: Exited with exit code 1 +srun: error: r9i5n3: task 12: Exited with exit code 1 +WARNING:__main__: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +WARNING:__main__: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +WARNING:__main__: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +WARNING:__main__: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +[default0]:using world size: 32, data-parallel-size: 8, tensor-model-parallel size: 2, pipeline-model-parallel size: 2 +[default0]:using torch.float16 for parameters ... +[default0]:------------------------ arguments ------------------------ +[default0]: abort_on_unmet_fused_kernel_constraints ......... True +[default0]: accumulate_allreduce_grads_in_fp32 .............. False +[default0]: adam_beta1 ...................................... 0.9 +[default0]: adam_beta2 ...................................... 0.95 +[default0]: adam_eps ........................................ 1e-08 +[default0]: adlr_autoresume ................................. False +[default0]: adlr_autoresume_interval ........................ 1000 +[default0]: apply_query_key_layer_scaling ................... True +[default0]: apply_residual_connection_post_layernorm ........ False +[default0]: attention_dropout ............................... 0.1 +[default0]: attention_softmax_in_fp32 ....................... False +[default0]: bert_binary_head ................................ True +[default0]: bert_load ....................................... None +[default0]: bf16 ............................................ False +[default0]: bias_dropout_fusion ............................. True +[default0]: bias_gelu_fusion ................................ True +[default0]: biencoder_projection_dim ........................ 0 +[default0]: biencoder_shared_query_context_model ............ False +[default0]: block_data_path ................................. None +[default0]: checkpoint_activations .......................... True +[default0]: checkpoint_in_cpu ............................... False +[default0]: checkpoint_num_layers ........................... 1 +[default0]: clip_grad ....................................... 1.0 +[default0]: codecarbon_dir .................................. None +[default0]: consumed_train_samples .......................... 0 +[default0]: consumed_train_tokens ........................... 0 +[default0]: consumed_valid_samples .......................... 0 +[default0]: contigious_checkpointing ........................ False +[default0]: cpu_optimizer ................................... False +[default0]: cpu_torch_adam .................................. False +[default0]: curriculum_learning ............................. False +[default0]: data_impl ....................................... mmap +[default0]: data_parallel_size .............................. 8 +[default0]: data_path ....................................... None +[default0]: dataloader_type ................................. single +[default0]: DDP_impl ........................................ local +[default0]: decoder_seq_length .............................. None +[default0]: deepscale ....................................... False +[default0]: deepscale_config ................................ None +[default0]: deepspeed ....................................... True +[default0]: deepspeed_activation_checkpointing .............. True +[default0]: deepspeed_config ................................ ./ds_config.2114997.json +[default0]: deepspeed_mpi ................................... False +[default0]: distribute_checkpointed_activations ............. False +[default0]: distributed_backend ............................. nccl +[default0]: embed_layernorm ................................. True +[default0]: embedding_path .................................. None +[default0]: encoder_seq_length .............................. 2048 +[default0]: eod_mask_loss ................................... False +[default0]: eval_interval ................................... 125 +[default0]: eval_iters ...................................... 2 +[default0]: eval_only ....................................... None +[default0]: evidence_data_path .............................. None +[default0]: exit_duration_in_mins ........................... 5990 +[default0]: exit_interval ................................... None +[default0]: ffn_hidden_size ................................. 8192 +[default0]: finetune ........................................ False +[default0]: fp16 ............................................ True +[default0]: fp16_lm_cross_entropy ........................... False +[default0]: fp32_residual_connection ........................ False +[default0]: gigaflos_no_embeds .............................. 0 +[default0]: global_batch_size ............................... 2048 +[default0]: glu_activation .................................. None +[default0]: hidden_dropout .................................. 0.1 +[default0]: hidden_size ..................................... 2048 +[default0]: hysteresis ...................................... 2 +[default0]: ict_head_size ................................... None +[default0]: ict_load ........................................ None +[default0]: img_dim ......................................... 224 +[default0]: indexer_batch_size .............................. 128 +[default0]: indexer_log_interval ............................ 1000 +[default0]: inference ....................................... False +[default0]: init_method_std ................................. 0.0048 +[default0]: init_method_xavier_uniform ...................... False +[default0]: initial_loss_scale .............................. 4294967296 +[default0]: kill_switch_path ................................ /gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/kill-switch-tr13b-1B3-mtf +[default0]: kv_channels ..................................... 128 +[default0]: layernorm_epsilon ............................... 1e-05 +[default0]: lazy_mpu_init ................................... None +[default0]: load ............................................ /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq +[default0]: local_rank ...................................... None +[default0]: log_batch_size_to_tensorboard ................... True +[default0]: log_interval .................................... 1 +[default0]: log_learning_rate_to_tensorboard ................ True +[default0]: log_level ....................................... None +[default0]: log_level_replica ............................... None +[default0]: log_loss_scale_to_tensorboard ................... True +[default0]: log_num_zeros_in_grad ........................... False +[default0]: log_params_norm ................................. False +[default0]: log_path ........................................ None +[default0]: log_timers_to_tensorboard ....................... True +[default0]: log_validation_ppl_to_tensorboard ............... True +[default0]: loss_on_targets_only ............................ False +[default0]: loss_scale ...................................... None +[default0]: loss_scale_window ............................... 1000 +[default0]: lr .............................................. 2e-05 +[default0]: lr_decay_iters .................................. None +[default0]: lr_decay_samples ................................ None +[default0]: lr_decay_style .................................. constant +[default0]: lr_decay_tokens ................................. None +[default0]: lr_warmup_fraction .............................. None +[default0]: lr_warmup_iters ................................. 0 +[default0]: lr_warmup_samples ............................... 0 +[default0]: make_vocab_size_divisible_by .................... 128 +[default0]: mask_prob ....................................... 0.15 +[default0]: masked_softmax_fusion ........................... True +[default0]: max_position_embeddings ......................... 2048 +[default0]: mean_noise_span_length .......................... None +[default0]: memory_centric_tiled_linear ..................... False +[default0]: merge_file ...................................... None +[default0]: micro_batch_size ................................ 1 +[default0]: min_loss_scale .................................. 1.0 +[default0]: min_lr .......................................... 0.0 +[default0]: mmap_warmup ..................................... False +[default0]: no_load_optim ................................... True +[default0]: no_load_rng ..................................... None +[default0]: no_save_optim ................................... None +[default0]: no_save_rng ..................................... None +[default0]: noise_density ................................... None +[default0]: norm_target_loss ................................ True +[default0]: num_attention_heads ............................. 16 +[default0]: num_channels .................................... 3 +[default0]: num_classes ..................................... 1000 +[default0]: num_layers ...................................... 24 +[default0]: num_layers_per_virtual_pipeline_stage ........... None +[default0]: num_workers ..................................... 2 +[default0]: onnx_safe ....................................... None +[default0]: openai_gelu ..................................... False +[default0]: optimizer ....................................... adam +[default0]: override_lr_scheduler ........................... False +[default0]: pad_vocab_size_to ............................... 250880 +[default0]: params_dtype .................................... torch.float16 +[default0]: partition_activations ........................... False +[default0]: patch_dim ....................................... 16 +[default0]: pipeline_model_parallel_size .................... 2 +[default0]: position_embedding_type ......................... PositionEmbeddingType.alibi +[default0]: pp_partition_method ............................. type:transformer|embedding +[default0]: prefixlm ........................................ False +[default0]: profile_backward ................................ False +[default0]: query_in_block_prob ............................. 0.1 +[default0]: rampup_batch_size ............................... None +[default0]: rank ............................................ 0 +[default0]: remote_device ................................... none +[default0]: reset_attention_mask ............................ False +[default0]: reset_position_ids .............................. False +[default0]: reset_progress .................................. True +[default0]: retriever_report_topk_accuracies ................ [] +[default0]: retriever_score_scaling ......................... False +[default0]: retriever_seq_length ............................ 256 +[default0]: reweight_loss_based_on_position_frequency ....... False +[default0]: sample_rate ..................................... 1.0 +[default0]: save ............................................ /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq +[default0]: save_interval ................................... 2 +[default0]: scatter_gather_tensors_in_pipeline .............. True +[default0]: scattered_embeddings ............................ False +[default0]: seed ............................................ 42 +[default0]: seq_length ...................................... 2048 +[default0]: sgd_momentum .................................... 0.9 +[default0]: short_seq_prob .................................. 0.1 +[default0]: skip_train_iteration_range ...................... None +[default0]: split ........................................... None +[default0]: split_transformers .............................. False +[default0]: sync_tp_duplicated_parameters ................... False +[default0]: synchronize_each_layer .......................... False +[default0]: tensor_model_parallel_size ...................... 2 +[default0]: tensorboard_dir ................................. /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/tr13b-1B3-ml-logs/tensorboard/xp3capmixnewcodelonglossseq +[default0]: tensorboard_log_interval ........................ 1 +[default0]: tensorboard_queue_size .......................... 5 +[default0]: test_weighted_split_paths ....................... None +[default0]: test_weighted_split_paths_path .................. None +[default0]: tile_factor ..................................... 1 +[default0]: titles_data_path ................................ None +[default0]: tokenizer_name_or_path .......................... bigscience/tokenizer +[default0]: tokenizer_type .................................. PretrainedFromHF +[default0]: train_iters ..................................... None +[default0]: train_samples ................................... 6348800 +[default0]: train_tokens .................................... None +[default0]: train_weighted_split_names ...................... ['train'] +[default0]: train_weighted_split_paths ...................... [['/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_en', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_es', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_pt', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_code', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_fr', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ar', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_id', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_zh', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_hi', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_vi', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ur', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_te', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ta', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_bn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_mr', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_sw', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_gu', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_pa', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ne', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_yo', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ig', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ny', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_zu', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_xh', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_sn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ts', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_rw', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_lg', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_tn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_nso', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_rn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ml', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_kn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_or', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_as', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ln', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_wo', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_tum', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ki', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_st', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_fon', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ca', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_eu', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ak', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_bm', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_tw']] +[default0]: train_weighted_split_paths_path ................. None +[default0]: train_weighted_split_splits ..................... [['0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950']] +[default0]: train_weighted_split_weights .................... [['0.3924620202', '0.0797519865', '0.0645613968', '0.0592991658', '0.0584218969', '0.0492644683', '0.0485168956', '0.048344327', '0.0462777165', '0.0326663657', '0.0202046859', '0.0140392334', '0.0097374884', '0.0087708344', '0.0070173416', '0.0059077793', '0.0059055884', '0.0055112422', '0.0041465344', '0.0037157869', '0.0034255885', '0.0028662571', '0.0027776135', '0.0026823974', '0.0026594781', '0.0026358847', '0.0026264474', '0.0024850557', '0.0024808426', '0.0024194999', '0.0020327371', '0.0018436532', '0.0017427072', '0.0017007448', '0.0016458059', '0.0013303289', '0.0012908943', '0.0012221364', '0.001211688', '0.0012015765', '0.0011909595', '0.0011650068', '0.001138717', '0.0011385485', '0.0011275944', '0.0011195053']] +[default0]: universal_checkpoint ............................ False +[default0]: use_bnb_optimizer ............................... False +[default0]: use_checkpoint_lr_scheduler ..................... False +[default0]: use_contiguous_buffers_in_ddp ................... False +[default0]: use_cpu_initialization .......................... None +[default0]: use_one_sent_docs ............................... False +[default0]: use_pin_memory .................................. False +[default0]: valid_num_workers ............................... 2 +[default0]: valid_weighted_split_names ...................... ['validation'] +[default0]: valid_weighted_split_paths ...................... [['/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_en', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_es', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_pt', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_code', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_fr', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ar', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_id', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_zh', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_hi', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_vi', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ur', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_te', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ta', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_bn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_mr', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_sw', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_gu', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_pa', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ne', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_yo', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ig', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ny', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_zu', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_xh', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_sn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ts', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_rw', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_lg', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_tn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_nso', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_rn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ml', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_kn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_or', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_as', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ln', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_wo', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_tum', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ki', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_st', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_fon', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ca', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_eu', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ak', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_bm', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_tw']] +[default0]: valid_weighted_split_paths_path ................. None +[default0]: valid_weighted_split_splits ..................... [['0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1']] +[default0]: valid_weighted_split_weights .................... [['0.3924620202', '0.0797519865', '0.0645613968', '0.0592991658', '0.0584218969', '0.0492644683', '0.0485168956', '0.048344327', '0.0462777165', '0.0326663657', '0.0202046859', '0.0140392334', '0.0097374884', '0.0087708344', '0.0070173416', '0.0059077793', '0.0059055884', '0.0055112422', '0.0041465344', '0.0037157869', '0.0034255885', '0.0028662571', '0.0027776135', '0.0026823974', '0.0026594781', '0.0026358847', '0.0026264474', '0.0024850557', '0.0024808426', '0.0024194999', '0.0020327371', '0.0018436532', '0.0017427072', '0.0017007448', '0.0016458059', '0.0013303289', '0.0012908943', '0.0012221364', '0.001211688', '0.0012015765', '0.0011909595', '0.0011650068', '0.001138717', '0.0011385485', '0.0011275944', '0.0011195053']] +[default0]: virtual_pipeline_model_parallel_size ............ None +[default0]: vocab_extra_ids ................................. 0 +[default0]: vocab_file ...................................... None +[default0]: weight_decay .................................... 0.0001 +[default0]: world_size ...................................... 32 +[default0]: zero_allgather_bucket_size ...................... 0.0 +[default0]: zero_contigious_gradients ....................... False +[default0]: zero_reduce_bucket_size ......................... 0.0 +[default0]: zero_reduce_scatter ............................. False +[default0]: zero_stage ...................................... 1 +[default0]:-------------------- end of arguments --------------------- +[default0]:setting number of micro-batches to constant 256 +[default0]:> building PretrainedFromHF tokenizer ... +[default0]: vocab file is un-used. loading tokenizer from pre-trained model +[default0]:Offline mode: forcing local_files_only=True +[default0]:Offline mode: forcing local_files_only=True +[default0]:loading file tokenizer.json from cache at /gpfswork/rech/six/commun/models/models--bigscience--tokenizer/snapshots/d56c744c331ce0ffa516ed084785eb22da74b91e/tokenizer.json +[default0]:loading file added_tokens.json from cache at None +[default0]:loading file special_tokens_map.json from cache at /gpfswork/rech/six/commun/models/models--bigscience--tokenizer/snapshots/d56c744c331ce0ffa516ed084785eb22da74b91e/special_tokens_map.json +[default0]:loading file tokenizer_config.json from cache at /gpfswork/rech/six/commun/models/models--bigscience--tokenizer/snapshots/d56c744c331ce0ffa516ed084785eb22da74b91e/tokenizer_config.json +[default0]: > padded vocab (size: 250680) with 200 dummy tokens (new size: 250880) +[default0]:DeepSpeed general environment info: +[default0]:torch install path ............... ['/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch'] +[default0]:torch version .................... 1.12.0 +[default0]:torch cuda version ............... 11.3 +[default0]:torch hip version ................ None +[default0]:nvcc version ..................... 11.4 +[default0]:deepspeed install path ........... ['/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed'] +[default0]:deepspeed info ................... 0.7.1+8b2a6371, 8b2a6371, master +[default0]:deepspeed wheel compiled w. ...... torch 1.12, cuda 11.3 +[default0]:**** Git info for Megatron: git_hash=6c1018f git_branch=mtf-multival **** +[default0]:> initializing torch distributed ... +[default0]:[2022-10-07 22:51:11,631] [INFO] [comm.py:628:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl +[default7]:> setting tensorboard ... +[default0]:> initializing tensor model parallel with size 2 +[default0]:> initializing pipeline model parallel with size 2 +[default0]:> setting random seeds to 42 ... +[default0]:[2022-10-07 22:51:13,097] [INFO] [checkpointing.py:226:model_parallel_cuda_manual_seed] > initializing model parallel cuda seeds on global rank 0, model parallel rank 0, and data parallel rank 0 with model parallel seed: 2760 and data parallel seed: 42 +[default0]:> compiling dataset index builder ... +[default0]:make: Entering directory '/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/data' +[default0]:make: Nothing to be done for 'default'. +[default0]:make: Leaving directory '/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/data' +[default0]:>>> done with dataset index builder. Compilation time: 0.048 seconds +[default0]:> compiling and loading fused kernels ... +[default0]:Detected CUDA files, patching ldflags +[default0]:Emitting ninja build file /gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/fused_kernels/build/build.ninja... +[default0]:Building extension module scaled_upper_triang_masked_softmax_cuda... +[default0]:Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) +[default0]:ninja: no work to do. +[default0]:Loading extension module scaled_upper_triang_masked_softmax_cuda... +[default0]:Detected CUDA files, patching ldflags +[default0]:Emitting ninja build file /gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/fused_kernels/build/build.ninja... +[default0]:Building extension module scaled_masked_softmax_cuda... +[default0]:Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) +[default0]:ninja: no work to do. +[default0]:Loading extension module scaled_masked_softmax_cuda... +[default0]:Detected CUDA files, patching ldflags +[default0]:Emitting ninja build file /gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/fused_kernels/build/build.ninja... +[default0]:Building extension module fused_mix_prec_layer_norm_cuda... +[default0]:Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) +[default0]:ninja: no work to do. +[default0]:Loading extension module fused_mix_prec_layer_norm_cuda... +[default0]:>>> done with compiling and loading fused kernels. Compilation time: 6.023 seconds +[default0]:time to initialize megatron (seconds): 71.180 +[default0]:[after megatron is initialized] datetime: 2022-10-07 22:51:19 +[default0]:building GPT model ... +[default0]:[2022-10-07 22:51:19,230] [INFO] [utils.py:827:see_memory_usage] Before Building Model +[default0]:[2022-10-07 22:51:19,231] [INFO] [utils.py:828:see_memory_usage] MA 0.0 GB Max_MA 0.0 GB CA 0.0 GB Max_CA 0 GB +[default0]:[2022-10-07 22:51:19,231] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 32.98 GB, percent = 6.6% +[default0]:SEED_LAYERS=False BASE_SEED=1234 SEED_FN=None +[default0]:Using topology: {ProcessCoord(pipe=0, data=0, model=0): 0, ProcessCoord(pipe=0, data=0, model=1): 1, ProcessCoord(pipe=0, data=1, model=0): 2, ProcessCoord(pipe=0, data=1, model=1): 3, ProcessCoord(pipe=0, data=2, model=0): 4, ProcessCoord(pipe=0, data=2, model=1): 5, ProcessCoord(pipe=0, data=3, model=0): 6, ProcessCoord(pipe=0, data=3, model=1): 7, ProcessCoord(pipe=0, data=4, model=0): 8, ProcessCoord(pipe=0, data=4, model=1): 9, ProcessCoord(pipe=0, data=5, model=0): 10, ProcessCoord(pipe=0, data=5, model=1): 11, ProcessCoord(pipe=0, data=6, model=0): 12, ProcessCoord(pipe=0, data=6, model=1): 13, ProcessCoord(pipe=0, data=7, model=0): 14, ProcessCoord(pipe=0, data=7, model=1): 15, ProcessCoord(pipe=1, data=0, model=0): 16, ProcessCoord(pipe=1, data=0, model=1): 17, ProcessCoord(pipe=1, data=1, model=0): 18, ProcessCoord(pipe=1, data=1, model=1): 19, ProcessCoord(pipe=1, data=2, model=0): 20, ProcessCoord(pipe=1, data=2, model=1): 21, ProcessCoord(pipe=1, data=3, model=0): 22, ProcessCoord(pipe=1, data=3, model=1): 23, ProcessCoord(pipe=1, data=4, model=0): 24, ProcessCoord(pipe=1, data=4, model=1): 25, ProcessCoord(pipe=1, data=5, model=0): 26, ProcessCoord(pipe=1, data=5, model=1): 27, ProcessCoord(pipe=1, data=6, model=0): 28, ProcessCoord(pipe=1, data=6, model=1): 29, ProcessCoord(pipe=1, data=7, model=0): 30, ProcessCoord(pipe=1, data=7, model=1): 31} +[default0]:[2022-10-07 22:51:19,675] [INFO] [module.py:366:_partition_layers] Partitioning pipeline stages with method type:transformer|embedding +[default0]:stage=0 layers=15 +[default0]: 0: _to_float16 +[default0]: 1: EmbeddingPipe +[default0]: 2: +[default0]: 3: ParallelTransformerLayerPipe +[default0]: 4: ParallelTransformerLayerPipe +[default0]: 5: ParallelTransformerLayerPipe +[default0]: 6: ParallelTransformerLayerPipe +[default0]: 7: ParallelTransformerLayerPipe +[default0]: 8: ParallelTransformerLayerPipe +[default0]: 9: ParallelTransformerLayerPipe +[default0]: 10: ParallelTransformerLayerPipe +[default0]: 11: ParallelTransformerLayerPipe +[default0]: 12: ParallelTransformerLayerPipe +[default0]: 13: ParallelTransformerLayerPipe +[default0]: 14: ParallelTransformerLayerPipe +[default0]:stage=1 layers=16 +[default0]: 15: ParallelTransformerLayerPipe +[default0]: 16: ParallelTransformerLayerPipe +[default0]: 17: ParallelTransformerLayerPipe +[default0]: 18: ParallelTransformerLayerPipe +[default0]: 19: ParallelTransformerLayerPipe +[default0]: 20: ParallelTransformerLayerPipe +[default0]: 21: ParallelTransformerLayerPipe +[default0]: 22: ParallelTransformerLayerPipe +[default0]: 23: ParallelTransformerLayerPipe +[default0]: 24: ParallelTransformerLayerPipe +[default0]: 25: ParallelTransformerLayerPipe +[default0]: 26: ParallelTransformerLayerPipe +[default0]: 27: undo +[default0]: 28: MixedFusedLayerNorm +[default0]: 29: EmbeddingPipe +[default0]: 30: float16_to_fp32 +[default0]: loss: CrossEntropy +[default0]:[2022-10-07 22:51:20,304] [INFO] [utils.py:827:see_memory_usage] After Building Model +[default0]:[2022-10-07 22:51:20,304] [INFO] [utils.py:828:see_memory_usage] MA 1.04 GB Max_MA 1.04 GB CA 1.04 GB Max_CA 1 GB +[default0]:[2022-10-07 22:51:20,305] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 33.41 GB, percent = 6.6% +[default0]:setting training iterations to 3100 +[default0]:> learning rate decay style: constant +[default0]:DeepSpeed is enabled. +[default0]:[2022-10-07 22:51:20,306] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed info: version=0.7.1+8b2a6371, git-hash=8b2a6371, git-branch=master +[default2]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default4]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default0]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default6]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default0]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default5]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default3]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default7]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default1]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default2]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default6]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default4]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default1]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default3]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default5]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default7]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default1]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default0]:[2022-10-07 22:51:21,362] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed Flops Profiler Enabled: False +[default0]:[2022-10-07 22:51:21,362] [INFO] [logging.py:68:log_dist] [Rank 0] Removing param_group that has no 'params' in the client Optimizer +[default0]:[2022-10-07 22:51:21,362] [INFO] [logging.py:68:log_dist] [Rank 0] Using client Optimizer as basic optimizer +[default0]:[2022-10-07 22:51:21,365] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed Basic Optimizer = {basic_optimizer.__class__.__name__} +[default0]:[2022-10-07 22:51:21,365] [INFO] [utils.py:52:is_zero_supported_optimizer] Checking ZeRO support for optimizer=FusedAdam type= +[default0]:[2022-10-07 22:51:21,365] [INFO] [logging.py:68:log_dist] [Rank 0] Creating fp16 ZeRO stage 1 optimizer +[default0]:[2022-10-07 22:51:21,365] [INFO] [stage_1_and_2.py:134:__init__] Reduce bucket size 500000000 +[default0]:[2022-10-07 22:51:21,365] [INFO] [stage_1_and_2.py:135:__init__] Allgather bucket size 500000000 +[default0]:[2022-10-07 22:51:21,365] [INFO] [stage_1_and_2.py:136:__init__] CPU Offload: False +[default0]:[2022-10-07 22:51:21,365] [INFO] [stage_1_and_2.py:137:__init__] Round robin gradient partitioning: False +[default0]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default0]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default4]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default2]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default1]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default3]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default5]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default6]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default7]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default2]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default4]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default3]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default7]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default6]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default5]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default4]:Emitting ninja build file /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113/utils/build.ninja... +[default4]:Building extension module utils... +[default4]:Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) +[default5]:Loading extension module utils... +[default3]:Loading extension module utils... +[default0]:Loading extension module utils... +[default7]:Loading extension module utils... +[default4]:Loading extension module utils... +[default2]:Loading extension module utils... +[default6]:Loading extension module utils... +[default1]:Loading extension module utils... +[default1]:Loading extension module utils... +[default1]:Time to load utils op: 0.6634068489074707 seconds +[default2]:Loading extension module utils... +[default2]:Time to load utils op: 0.6956143379211426 seconds +[default3]:Loading extension module utils... +[default3]:Time to load utils op: 0.6572694778442383 seconds +[default4]:Loading extension module utils... +[default4]:Time to load utils op: 0.6955108642578125 seconds +[default5]:Loading extension module utils... +[default5]:Time to load utils op: 0.6615052223205566 seconds +[default6]:Loading extension module utils... +[default6]:Time to load utils op: 0.6949939727783203 seconds +[default0]:Loading extension module utils... +[default0]:Time to load utils op: 0.6967802047729492 seconds +[default7]:Loading extension module utils... +[default7]:Time to load utils op: 0.662064790725708 seconds +[default1]:Loading extension module utils... +[default1]:Time to load utils op: 0.632749080657959 seconds +[default0]:Loading extension module utils... +[default0]:Time to load utils op: 0.6539936065673828 seconds +[default1]:Loading extension module utils... +[default4]:ninja: no work to do. +[default4]:Loading extension module utils... +[default4]:Time to load utils op: 0.6376211643218994 seconds +[default1]:Time to load utils op: 0.6218938827514648 seconds +[default0]:Loading extension module utils... +[default0]:Time to load utils op: 0.637559175491333 seconds +[default5]:Loading extension module utils... +[default5]:Time to load utils op: 0.6221449375152588 seconds +[default2]:Loading extension module utils... +[default2]:Time to load utils op: 0.6375787258148193 seconds +[default3]:Loading extension module utils... +[default3]:Time to load utils op: 0.6218862533569336 seconds +[default6]:Loading extension module utils... +[default6]:Time to load utils op: 0.6375739574432373 seconds +[default7]:Loading extension module utils... +[default7]:Time to load utils op: 0.6223678588867188 seconds +[default2]:Loading extension module utils... +[default2]:Time to load utils op: 0.654512882232666 seconds +[default4]:Loading extension module utils... +[default4]:Time to load utils op: 0.6534581184387207 seconds +[default3]:Loading extension module utils... +[default3]:Time to load utils op: 0.6303722858428955 seconds +[default7]:Loading extension module utils... +[default7]:Time to load utils op: 0.6321818828582764 seconds +[default5]:Loading extension module utils... +[default5]:Time to load utils op: 0.6314120292663574 seconds +[default6]:Loading extension module utils... +[default6]:Time to load utils op: 0.6528308391571045 seconds +[default5]:Time to load utils op: 0.6693360805511475 seconds +[default3]:Time to load utils op: 0.6723008155822754 seconds +[default0]:Time to load utils op: 0.7047781944274902 seconds +[default7]:Time to load utils op: 0.6704850196838379 seconds +[default4]:Time to load utils op: 0.7046725749969482 seconds +[default2]:Time to load utils op: 0.7048468589782715 seconds +[default6]:Time to load utils op: 0.7048649787902832 seconds +[default1]:Time to load utils op: 0.6715102195739746 seconds +[default6]:Rank: 22 partition count [8, 8, 8] and sizes[(49938432, False), (19922944, False), (29696, False)] +[default7]:Rank: 23 partition count [8, 8, 8] and sizes[(49938432, False), (19922944, False), (29696, False)] +[default4]:Rank: 4 partition count [8, 8, 8] and sizes[(49938432, False), (19922944, False), (29696, False)] +[default5]:Rank: 5 partition count [8, 8, 8] and sizes[(49938432, False), (19922944, False), (29696, False)] +[default1]:Rank: 9 partition count [8, 8, 8] and sizes[(49938432, False), (19922944, False), (29696, False)] +[default0]:Rank: 8 partition count [8, 8, 8] and sizes[(49938432, False), (19922944, False), (29696, False)] +[default7]:Rank: 7 partition count [8, 8, 8] and sizes[(49938432, False), (19922944, False), (29696, False)] +[default6]:Rank: 6 partition count [8, 8, 8] and sizes[(49938432, False), (19922944, False), (29696, False)] +[default0]:Rank: 24 partition count [8, 8, 8] and sizes[(49938432, False), (19922944, False), (29696, False)] +[default1]:Rank: 25 partition count [8, 8, 8] and sizes[(49938432, False), (19922944, False), (29696, False)] +[default2]:Rank: 18 partition count [8, 8, 8] and sizes[(49938432, False), (19922944, False), (29696, False)] +[default3]:Rank: 19 partition count [8, 8, 8] and sizes[(49938432, False), (19922944, False), (29696, False)] +[default2]:Rank: 10 partition count [8, 8, 8] and sizes[(49938432, False), (19922944, False), (29696, False)] +[default3]:Rank: 11 partition count [8, 8, 8] and sizes[(49938432, False), (19922944, False), (29696, False)] +[default5]:Rank: 29 partition count [8, 8, 8] and sizes[(49938432, False), (19922944, False), (29696, False)] +[default4]:Rank: 28 partition count [8, 8, 8] and sizes[(49938432, False), (19922944, False), (29696, False)] +[default3]:Rank: 27 partition count [8, 8, 8] and sizes[(49938432, False), (19922944, False), (29696, False)] +[default2]:Rank: 26 partition count [8, 8, 8] and sizes[(49938432, False), (19922944, False), (29696, False)] +[default1]:Rank: 17 partition count [8, 8, 8] and sizes[(49938432, False), (19922944, False), (29696, False)] +[default0]:Rank: 16 partition count [8, 8, 8] and sizes[(49938432, False), (19922944, False), (29696, False)] +[default7]:Rank: 31 partition count [8, 8, 8] and sizes[(49938432, False), (19922944, False), (29696, False)] +[default6]:Rank: 30 partition count [8, 8, 8] and sizes[(49938432, False), (19922944, False), (29696, False)] +[default4]:Rank: 20 partition count [8, 8, 8] and sizes[(49938432, False), (19922944, False), (29696, False)] +[default5]:Rank: 21 partition count [8, 8, 8] and sizes[(49938432, False), (19922944, False), (29696, False)] +[default5]:Rank: 13 partition count [8, 8, 8] and sizes[(49938432, False), (19922944, False), (29696, False)] +[default4]:Rank: 12 partition count [8, 8, 8] and sizes[(49938432, False), (19922944, False), (29696, False)] +[default1]:Rank: 1 partition count [8, 8, 8] and sizes[(49938432, False), (19922944, False), (29696, False)] +[default0]:Rank: 0 partition count [8, 8, 8] and sizes[(49938432, False), (19922944, False), (29696, False)] +[default6]:Rank: 14 partition count [8, 8, 8] and sizes[(49938432, False), (19922944, False), (29696, False)] +[default7]:Rank: 15 partition count [8, 8, 8] and sizes[(49938432, False), (19922944, False), (29696, False)] +[default3]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default2]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default7]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default7]:No modifications detected for re-loaded extension module utils, skipping build step... +[default7]:Loading extension module utils... +[default0]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default0]:No modifications detected for re-loaded extension module utils, skipping build step... +[default0]:Loading extension module utils... +[default1]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default1]:No modifications detected for re-loaded extension module utils, skipping build step... +[default1]:Loading extension module utils... +[default1]:Time to load utils op: 0.0006053447723388672 seconds +[default2]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default2]:No modifications detected for re-loaded extension module utils, skipping build step... +[default2]:Loading extension module utils... +[default2]:Time to load utils op: 0.0006320476531982422 seconds +[default3]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default4]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default4]:No modifications detected for re-loaded extension module utils, skipping build step... +[default3]:No modifications detected for re-loaded extension module utils, skipping build step... +[default3]:Loading extension module utils... +[default3]:Time to load utils op: 0.0005922317504882812 seconds +[default4]:Loading extension module utils... +[default4]:Time to load utils op: 0.0008008480072021484 seconds +[default5]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default5]:No modifications detected for re-loaded extension module utils, skipping build step... +[default5]:Loading extension module utils... +[default5]:Time to load utils op: 0.0004851818084716797 seconds +[default0]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default0]:No modifications detected for re-loaded extension module utils, skipping build step... +[default0]:Loading extension module utils... +[default0]:Time to load utils op: 0.0007779598236083984 seconds +[default6]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default6]:No modifications detected for re-loaded extension module utils, skipping build step... +[default6]:Loading extension module utils... +[default6]:Time to load utils op: 0.0004553794860839844 seconds +[default7]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default7]:No modifications detected for re-loaded extension module utils, skipping build step... +[default7]:Loading extension module utils... +[default7]:Time to load utils op: 0.0004923343658447266 seconds +[default1]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default1]:No modifications detected for re-loaded extension module utils, skipping build step... +[default1]:Loading extension module utils... +[default1]:Time to load utils op: 0.0007898807525634766 seconds +[default5]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default5]:No modifications detected for re-loaded extension module utils, skipping build step... +[default5]:Loading extension module utils... +[default5]:Time to load utils op: 0.0005517005920410156 seconds +[default4]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default4]:No modifications detected for re-loaded extension module utils, skipping build step... +[default4]:Loading extension module utils... +[default4]:Time to load utils op: 0.0005733966827392578 seconds +[default2]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default2]:No modifications detected for re-loaded extension module utils, skipping build step... +[default2]:Loading extension module utils... +[default2]:Time to load utils op: 0.0005385875701904297 seconds +[default0]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default3]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default3]:No modifications detected for re-loaded extension module utils, skipping build step... +[default3]:Loading extension module utils... +[default3]:Time to load utils op: 0.000537872314453125 seconds +[default0]:No modifications detected for re-loaded extension module utils, skipping build step... +[default0]:Loading extension module utils... +[default0]:Time to load utils op: 0.0005145072937011719 seconds +[default6]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default6]:No modifications detected for re-loaded extension module utils, skipping build step... +[default6]:Loading extension module utils... +[default6]:Time to load utils op: 0.0004792213439941406 seconds +[default7]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default1]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default1]:No modifications detected for re-loaded extension module utils, skipping build step... +[default7]:No modifications detected for re-loaded extension module utils, skipping build step... +[default7]:Loading extension module utils... +[default1]:Loading extension module utils... +[default1]:Time to load utils op: 0.0005443096160888672 seconds +[default7]:Time to load utils op: 0.00049591064453125 seconds +[default2]:Rank: 2 partition count [8, 8, 8] and sizes[(49938432, False), (19922944, False), (29696, False)] +[default2]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default2]:No modifications detected for re-loaded extension module utils, skipping build step... +[default2]:Loading extension module utils... +[default2]:Time to load utils op: 0.0011925697326660156 seconds +[default3]:Rank: 3 partition count [8, 8, 8] and sizes[(49938432, False), (19922944, False), (29696, False)] +[default4]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default4]:No modifications detected for re-loaded extension module utils, skipping build step... +[default4]:Loading extension module utils... +[default3]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default3]:No modifications detected for re-loaded extension module utils, skipping build step... +[default4]:Time to load utils op: 0.002013683319091797 seconds +[default3]:Loading extension module utils... +[default3]:Time to load utils op: 0.0007350444793701172 seconds +[default7]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default7]:No modifications detected for re-loaded extension module utils, skipping build step... +[default7]:Loading extension module utils... +[default7]:Time to load utils op: 0.0019693374633789062 seconds +[default5]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default5]:No modifications detected for re-loaded extension module utils, skipping build step... +[default5]:Loading extension module utils... +[default5]:Time to load utils op: 0.0007846355438232422 seconds +[default6]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default6]:No modifications detected for re-loaded extension module utils, skipping build step... +[default6]:Loading extension module utils... +[default6]:Time to load utils op: 0.0022194385528564453 seconds +[default3]:No modifications detected for re-loaded extension module utils, skipping build step... +[default1]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default1]:No modifications detected for re-loaded extension module utils, skipping build step... +[default1]:Loading extension module utils... +[default3]:Loading extension module utils... +[default3]:Time to load utils op: 0.0010211467742919922 seconds +[default1]:Time to load utils op: 0.0005018711090087891 seconds +[default6]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default6]:No modifications detected for re-loaded extension module utils, skipping build step... +[default6]:Loading extension module utils... +[default6]:Time to load utils op: 0.00046062469482421875 seconds +[default5]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default7]:Time to load utils op: 0.0009019374847412109 seconds +[default2]:No modifications detected for re-loaded extension module utils, skipping build step... +[default2]:Loading extension module utils... +[default2]:Time to load utils op: 0.0007305145263671875 seconds +[default5]:No modifications detected for re-loaded extension module utils, skipping build step... +[default4]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default4]:No modifications detected for re-loaded extension module utils, skipping build step... +[default4]:Loading extension module utils... +[default5]:Loading extension module utils... +[default5]:Time to load utils op: 0.0005068778991699219 seconds +[default4]:Time to load utils op: 0.0005707740783691406 seconds +[default0]:Time to load utils op: 0.0008761882781982422 seconds +[default0]:[2022-10-07 22:51:26,359] [INFO] [utils.py:827:see_memory_usage] Before initializing optimizer states +[default0]:[2022-10-07 22:51:26,359] [INFO] [utils.py:828:see_memory_usage] MA 1.3 GB Max_MA 1.34 GB CA 2.08 GB Max_CA 2 GB +[default0]:[2022-10-07 22:51:26,359] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 37.21 GB, percent = 7.4% +[default0]:[2022-10-07 22:51:26,426] [INFO] [utils.py:827:see_memory_usage] After initializing optimizer states +[default0]:[2022-10-07 22:51:26,426] [INFO] [utils.py:828:see_memory_usage] MA 1.82 GB Max_MA 2.08 GB CA 2.72 GB Max_CA 3 GB +[default0]:[2022-10-07 22:51:26,426] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 37.21 GB, percent = 7.4% +[default0]:[2022-10-07 22:51:26,426] [INFO] [stage_1_and_2.py:516:__init__] optimizer state initialized +[default0]:[2022-10-07 22:51:26,455] [INFO] [utils.py:827:see_memory_usage] After initializing ZeRO optimizer +[default0]:[2022-10-07 22:51:26,455] [INFO] [utils.py:828:see_memory_usage] MA 1.82 GB Max_MA 1.82 GB CA 2.72 GB Max_CA 3 GB +[default0]:[2022-10-07 22:51:26,455] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 37.21 GB, percent = 7.4% +[default0]:[2022-10-07 22:51:26,455] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed Final Optimizer = FusedAdam +[default0]:[2022-10-07 22:51:26,455] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed using client LR scheduler +[default0]:[2022-10-07 22:51:26,455] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed LR Scheduler = +[default0]:[2022-10-07 22:51:26,456] [INFO] [logging.py:68:log_dist] [Rank 0] step=0, skipped=0, lr=[2e-05, 2e-05, 2e-05], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)] +[default0]:[2022-10-07 22:51:26,456] [INFO] [config.py:987:print] DeepSpeedEngine configuration: +[default0]:[2022-10-07 22:51:26,456] [INFO] [config.py:991:print] activation_checkpointing_config { +[default0]: "partition_activations": false, +[default0]: "contiguous_memory_optimization": false, +[default0]: "cpu_checkpointing": false, +[default0]: "number_checkpoints": null, +[default0]: "synchronize_checkpoint_boundary": false, +[default0]: "profile": false +[default0]:} +[default0]:[2022-10-07 22:51:26,456] [INFO] [config.py:991:print] aio_config ................... {'block_size': 1048576, 'queue_depth': 8, 'thread_count': 1, 'single_submit': False, 'overlap_events': True} +[default0]:[2022-10-07 22:51:26,456] [INFO] [config.py:991:print] amp_enabled .................. False +[default0]:[2022-10-07 22:51:26,456] [INFO] [config.py:991:print] amp_params ................... False +[default0]:[2022-10-07 22:51:26,456] [INFO] [config.py:991:print] autotuning_config ............ { +[default0]: "enabled": false, +[default0]: "start_step": null, +[default0]: "end_step": null, +[default0]: "metric_path": null, +[default0]: "arg_mappings": null, +[default0]: "metric": "throughput", +[default0]: "model_info": null, +[default0]: "results_dir": null, +[default0]: "exps_dir": null, +[default0]: "overwrite": true, +[default0]: "fast": true, +[default0]: "start_profile_step": 3, +[default0]: "end_profile_step": 5, +[default0]: "tuner_type": "gridsearch", +[default0]: "tuner_early_stopping": 5, +[default0]: "tuner_num_trials": 50, +[default0]: "model_info_path": null, +[default0]: "mp_size": 1, +[default0]: "max_train_batch_size": null, +[default0]: "min_train_batch_size": 1, +[default0]: "max_train_micro_batch_size_per_gpu": 1.024000e+03, +[default0]: "min_train_micro_batch_size_per_gpu": 1, +[default0]: "num_tuning_micro_batch_sizes": 3 +[default0]:} +[default0]:[2022-10-07 22:51:26,456] [INFO] [config.py:991:print] bfloat16_enabled ............. False +[default0]:[2022-10-07 22:51:26,456] [INFO] [config.py:991:print] checkpoint_tag_validation_enabled True +[default0]:[2022-10-07 22:51:26,456] [INFO] [config.py:991:print] checkpoint_tag_validation_fail False +[default0]:[2022-10-07 22:51:26,456] [INFO] [config.py:991:print] comms_config ................. +[default0]:[2022-10-07 22:51:26,456] [INFO] [config.py:991:print] communication_data_type ...... None +[default0]:[2022-10-07 22:51:26,456] [INFO] [config.py:991:print] compression_config ........... {'weight_quantization': {'shared_parameters': {'enabled': False, 'quantizer_kernel': False, 'schedule_offset': 0, 'quantize_groups': 1, 'quantize_verbose': False, 'quantization_type': 'symmetric', 'quantize_weight_in_forward': False, 'rounding': 'nearest', 'fp16_mixed_quantize': False, 'quantize_change_ratio': 0.001}, 'different_groups': {}}, 'activation_quantization': {'shared_parameters': {'enabled': False, 'quantization_type': 'symmetric', 'range_calibration': 'dynamic', 'schedule_offset': 1000}, 'different_groups': {}}, 'sparse_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'row_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'head_pruning': {'shared_parameters': {'enabled': False, 'method': 'topk', 'schedule_offset': 1000}, 'different_groups': {}}, 'channel_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'layer_reduction': {'enabled': False}} +[default0]:[2022-10-07 22:51:26,456] [INFO] [config.py:991:print] curriculum_enabled ........... False +[default0]:[2022-10-07 22:51:26,456] [INFO] [config.py:991:print] curriculum_params ............ False +[default0]:[2022-10-07 22:51:26,456] [INFO] [config.py:991:print] dataloader_drop_last ......... False +[default0]:[2022-10-07 22:51:26,456] [INFO] [config.py:991:print] disable_allgather ............ False +[default0]:[2022-10-07 22:51:26,456] [INFO] [config.py:991:print] dump_state ................... False +[default0]:[2022-10-07 22:51:26,456] [INFO] [config.py:991:print] dynamic_loss_scale_args ...... {'init_scale': 4096, 'scale_window': 500, 'delayed_shift': 2, 'min_scale': 1} +[default0]:[2022-10-07 22:51:26,457] [INFO] [config.py:991:print] eigenvalue_enabled ........... False +[default0]:[2022-10-07 22:51:26,457] [INFO] [config.py:991:print] eigenvalue_gas_boundary_resolution 1 +[default0]:[2022-10-07 22:51:26,457] [INFO] [config.py:991:print] eigenvalue_layer_name ........ bert.encoder.layer +[default0]:[2022-10-07 22:51:26,457] [INFO] [config.py:991:print] eigenvalue_layer_num ......... 0 +[default0]:[2022-10-07 22:51:26,457] [INFO] [config.py:991:print] eigenvalue_max_iter .......... 100 +[default0]:[2022-10-07 22:51:26,457] [INFO] [config.py:991:print] eigenvalue_stability ......... 1e-06 +[default0]:[2022-10-07 22:51:26,457] [INFO] [config.py:991:print] eigenvalue_tol ............... 0.01 +[default0]:[2022-10-07 22:51:26,457] [INFO] [config.py:991:print] eigenvalue_verbose ........... False +[default0]:[2022-10-07 22:51:26,457] [INFO] [config.py:991:print] elasticity_enabled ........... False +[default0]:[2022-10-07 22:51:26,457] [INFO] [config.py:991:print] flops_profiler_config ........ { +[default0]: "enabled": false, +[default0]: "profile_step": 1, +[default0]: "module_depth": -1, +[default0]: "top_modules": 1, +[default0]: "detailed": true, +[default0]: "output_file": null +[default0]:} +[default0]:[2022-10-07 22:51:26,457] [INFO] [config.py:991:print] fp16_auto_cast ............... False +[default0]:[2022-10-07 22:51:26,457] [INFO] [config.py:991:print] fp16_enabled ................. True +[default0]:[2022-10-07 22:51:26,457] [INFO] [config.py:991:print] fp16_master_weights_and_gradients False +[default0]:[2022-10-07 22:51:26,457] [INFO] [config.py:991:print] global_rank .................. 0 +[default0]:[2022-10-07 22:51:26,457] [INFO] [config.py:991:print] gradient_accumulation_steps .. 256 +[default0]:[2022-10-07 22:51:26,457] [INFO] [config.py:991:print] gradient_clipping ............ 1.0 +[default0]:[2022-10-07 22:51:26,457] [INFO] [config.py:991:print] gradient_predivide_factor .... 1.0 +[default0]:[2022-10-07 22:51:26,457] [INFO] [config.py:991:print] initial_dynamic_scale ........ 4096 +[default0]:[2022-10-07 22:51:26,457] [INFO] [config.py:991:print] load_universal_checkpoint .... False +[default0]:[2022-10-07 22:51:26,457] [INFO] [config.py:991:print] loss_scale ................... 0 +[default0]:[2022-10-07 22:51:26,457] [INFO] [config.py:991:print] memory_breakdown ............. False +[default0]:[2022-10-07 22:51:26,457] [INFO] [config.py:991:print] monitor_config ............... +[default0]:[2022-10-07 22:51:26,457] [INFO] [config.py:991:print] nebula_config ................ { +[default0]: "enabled": false, +[default0]: "persistent_storage_path": null, +[default0]: "persistent_time_interval": 100, +[default0]: "num_of_version_in_retention": 2, +[default0]: "enable_nebula_load": true, +[default0]: "load_path": null +[default0]:} +[default0]:[2022-10-07 22:51:26,457] [INFO] [config.py:991:print] optimizer_legacy_fusion ...... False +[default0]:[2022-10-07 22:51:26,457] [INFO] [config.py:991:print] optimizer_name ............... None +[default0]:[2022-10-07 22:51:26,457] [INFO] [config.py:991:print] optimizer_params ............. None +[default0]:[2022-10-07 22:51:26,457] [INFO] [config.py:991:print] pipeline ..................... {'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0} +[default0]:[2022-10-07 22:51:26,457] [INFO] [config.py:991:print] pld_enabled .................. False +[default0]:[2022-10-07 22:51:26,457] [INFO] [config.py:991:print] pld_params ................... False +[default0]:[2022-10-07 22:51:26,457] [INFO] [config.py:991:print] prescale_gradients ........... False +[default0]:[2022-10-07 22:51:26,457] [INFO] [config.py:991:print] scheduler_name ............... None +[default0]:[2022-10-07 22:51:26,457] [INFO] [config.py:991:print] scheduler_params ............. None +[default0]:[2022-10-07 22:51:26,457] [INFO] [config.py:991:print] sparse_attention ............. None +[default0]:[2022-10-07 22:51:26,457] [INFO] [config.py:991:print] sparse_gradients_enabled ..... False +[default0]:[2022-10-07 22:51:26,457] [INFO] [config.py:991:print] steps_per_print .............. 2000 +[default0]:[2022-10-07 22:51:26,457] [INFO] [config.py:991:print] train_batch_size ............. 2048 +[default0]:[2022-10-07 22:51:26,457] [INFO] [config.py:991:print] train_micro_batch_size_per_gpu 1 +[default0]:[2022-10-07 22:51:26,457] [INFO] [config.py:991:print] wall_clock_breakdown ......... False +[default0]:[2022-10-07 22:51:26,457] [INFO] [config.py:991:print] world_size ................... 8 +[default0]:[2022-10-07 22:51:26,457] [INFO] [config.py:991:print] zero_allow_untested_optimizer False +[default0]:[2022-10-07 22:51:26,457] [INFO] [config.py:991:print] zero_config .................. stage=1 contiguous_gradients=True reduce_scatter=True reduce_bucket_size=500000000 allgather_partitions=True allgather_bucket_size=500000000 overlap_comm=False load_from_fp32_weights=True elastic_checkpoint=False offload_param=None offload_optimizer=None sub_group_size=1000000000 cpu_offload_param=None cpu_offload_use_pin_memory=None cpu_offload=None prefetch_bucket_size=50000000 param_persistence_threshold=100000 model_persistence_threshold=9223372036854775807 max_live_parameters=1000000000 max_reuse_distance=1000000000 gather_16bit_weights_on_model_save=False stage3_gather_fp16_weights_on_model_save=False ignore_unused_parameters=True legacy_stage1=False round_robin_gradients=False +[default0]:[2022-10-07 22:51:26,457] [INFO] [config.py:991:print] zero_enabled ................. True +[default0]:[2022-10-07 22:51:26,457] [INFO] [config.py:991:print] zero_optimization_stage ...... 1 +[default0]:[2022-10-07 22:51:26,457] [INFO] [config.py:976:print_user_config] json = { +[default0]: "train_micro_batch_size_per_gpu": 1, +[default0]: "train_batch_size": 2.048000e+03, +[default0]: "gradient_clipping": 1.0, +[default0]: "zero_optimization": { +[default0]: "stage": 1 +[default0]: }, +[default0]: "fp16": { +[default0]: "enabled": true, +[default0]: "loss_scale": 0, +[default0]: "loss_scale_window": 500, +[default0]: "hysteresis": 2, +[default0]: "min_loss_scale": 1, +[default0]: "initial_scale_power": 12 +[default0]: }, +[default0]: "steps_per_print": 2.000000e+03, +[default0]: "wall_clock_breakdown": false +[default0]:} +[default0]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default0]:No modifications detected for re-loaded extension module utils, skipping build step... +[default0]:Loading extension module utils... +[default0]:Time to load utils op: 0.00045680999755859375 seconds +[default0]:[2022-10-07 22:51:26,458] [INFO] [engine.py:87:__init__] CONFIG: micro_batches=256 micro_batch_size=1 +[default1]:[2022-10-07 22:51:26,900] [INFO] [engine.py:145:__init__] RANK=17 STAGE=1 LAYERS=16 [15, 31) STAGE_PARAMS=559128576 (559.129M) TOTAL_PARAMS=2236514304 (2236.514M) UNIQUE_PARAMS=1722712064 (1722.712M) +[default0]:[2022-10-07 22:51:26,900] [INFO] [engine.py:145:__init__] RANK=16 STAGE=1 LAYERS=16 [15, 31) STAGE_PARAMS=559128576 (559.129M) TOTAL_PARAMS=2236514304 (2236.514M) UNIQUE_PARAMS=1722712064 (1722.712M) +[default1]:[2022-10-07 22:51:26,900] [INFO] [engine.py:145:__init__] RANK=1 STAGE=0 LAYERS=15 [0, 15) STAGE_PARAMS=559128576 (559.129M) TOTAL_PARAMS=2236514304 (2236.514M) UNIQUE_PARAMS=1722712064 (1722.712M) +[default0]:[2022-10-07 22:51:26,900] [INFO] [engine.py:145:__init__] RANK=0 STAGE=0 LAYERS=15 [0, 15) STAGE_PARAMS=559128576 (559.129M) TOTAL_PARAMS=2236514304 (2236.514M) UNIQUE_PARAMS=1722712064 (1722.712M) +[default3]:[2022-10-07 22:51:27,062] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default1]:[2022-10-07 22:51:27,062] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default4]:[2022-10-07 22:51:27,062] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default7]:[2022-10-07 22:51:27,062] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default0]:[2022-10-07 22:51:27,062] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default2]:[2022-10-07 22:51:27,063] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default6]:[2022-10-07 22:51:27,062] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default5]:[2022-10-07 22:51:27,062] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default1]:[2022-10-07 22:51:27,070] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default2]:[2022-10-07 22:51:27,070] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default4]:[2022-10-07 22:51:27,070] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default3]:[2022-10-07 22:51:27,070] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default6]:[2022-10-07 22:51:27,070] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default5]:[2022-10-07 22:51:27,070] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default0]:[2022-10-07 22:51:27,070] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default7]:[2022-10-07 22:51:27,070] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default1]:[2022-10-07 22:51:27,066] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default0]:[2022-10-07 22:51:27,066] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default5]:[2022-10-07 22:51:27,066] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default7]:[2022-10-07 22:51:27,066] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default0]:[2022-10-07 22:51:27,066] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default2]:[2022-10-07 22:51:27,066] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default6]:[2022-10-07 22:51:27,066] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default4]:[2022-10-07 22:51:27,066] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default3]:[2022-10-07 22:51:27,066] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default1]:[2022-10-07 22:51:27,066] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default2]:[2022-10-07 22:51:27,066] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default4]:[2022-10-07 22:51:27,066] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default7]:[2022-10-07 22:51:27,066] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default3]:[2022-10-07 22:51:27,066] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default6]:[2022-10-07 22:51:27,066] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default5]:[2022-10-07 22:51:27,066] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default0]:[2022-10-07 22:51:28,502] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default0]:[2022-10-07 22:51:28,513] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default3]:[2022-10-07 22:51:28,500] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default3]:[2022-10-07 22:51:28,511] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_01_model_states.pt... +[default7]:[2022-10-07 22:51:28,547] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default7]:[2022-10-07 22:51:28,557] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_03_model_states.pt... +[default5]:[2022-10-07 22:51:28,545] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default5]:[2022-10-07 22:51:28,557] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_03_model_states.pt... +[default1]:[2022-10-07 22:51:28,727] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default1]:[2022-10-07 22:51:28,736] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_01_model_states.pt... +[default6]:[2022-10-07 22:51:28,788] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default6]:[2022-10-07 22:51:28,797] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_02_model_states.pt... +[default4]:[2022-10-07 22:51:28,768] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default4]:[2022-10-07 22:51:28,780] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_02_model_states.pt... +[default6]:[2022-10-07 22:51:28,768] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default6]:[2022-10-07 22:51:28,780] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_02_model_states.pt... +[default2]:[2022-10-07 22:51:28,788] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default2]:[2022-10-07 22:51:28,803] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default4]:[2022-10-07 22:51:28,883] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default4]:[2022-10-07 22:51:28,900] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_02_model_states.pt... +[default5]:[2022-10-07 22:51:28,848] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default5]:[2022-10-07 22:51:28,865] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_03_model_states.pt... +[default7]:[2022-10-07 22:51:28,914] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default7]:[2022-10-07 22:51:28,922] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_03_model_states.pt... +[default3]:[2022-10-07 22:51:29,352] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default7]:[2022-10-07 22:51:29,338] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default7]:[2022-10-07 22:51:29,355] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_01_model_states.pt... +[default4]:[2022-10-07 22:51:29,352] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default2]:[2022-10-07 22:51:29,347] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default5]:[2022-10-07 22:51:29,338] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default5]:[2022-10-07 22:51:29,354] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_01_model_states.pt... +[default6]:[2022-10-07 22:51:29,337] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default6]:[2022-10-07 22:51:29,355] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default1]:[2022-10-07 22:51:29,427] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default0]:[2022-10-07 22:51:29,368] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default0]:[2022-10-07 22:51:29,377] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_02_model_states.pt... +[default1]:[2022-10-07 22:51:29,341] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default0]:[2022-10-07 22:51:29,352] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default0]:[2022-10-07 22:51:29,368] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default1]:[2022-10-07 22:51:29,358] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_01_model_states.pt... +[default3]:[2022-10-07 22:51:29,371] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_01_model_states.pt... +[default4]:[2022-10-07 22:51:29,371] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default2]:[2022-10-07 22:51:29,364] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default1]:[2022-10-07 22:51:29,444] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_03_model_states.pt... +[default3]:[2022-10-07 22:51:29,466] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default3]:[2022-10-07 22:51:29,476] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_03_model_states.pt... +[default2]:[2022-10-07 22:51:29,436] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default2]:[2022-10-07 22:51:29,449] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_02_model_states.pt... +[default0]:[2022-10-07 22:51:29,436] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default0]:[2022-10-07 22:51:29,450] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_02_model_states.pt... +[default6]:[2022-10-07 22:51:29,580] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default6]:[2022-10-07 22:51:29,591] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default4]:[2022-10-07 22:51:29,681] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default4]:[2022-10-07 22:51:29,691] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default2]:[2022-10-07 22:51:29,765] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default2]:[2022-10-07 22:51:29,775] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_02_model_states.pt... +[default5]:[2022-10-07 22:51:29,917] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default5]:[2022-10-07 22:51:29,928] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_01_model_states.pt... +[default7]:[2022-10-07 22:51:30,145] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default1]:[2022-10-07 22:51:30,145] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default1]:[2022-10-07 22:51:30,155] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_03_model_states.pt... +[default3]:[2022-10-07 22:51:30,170] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default3]:[2022-10-07 22:51:30,181] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_03_model_states.pt... +[default7]:[2022-10-07 22:51:30,155] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_01_model_states.pt... +[default3]:[2022-10-07 22:51:30,708] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_01_model_states.pt. +[default3]:[2022-10-07 22:51:30,708] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default1]:[2022-10-07 22:51:30,708] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_01_model_states.pt. +[default1]:[2022-10-07 22:51:30,710] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default7]:[2022-10-07 22:51:30,822] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_03_model_states.pt. +[default7]:[2022-10-07 22:51:30,823] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default5]:[2022-10-07 22:51:30,822] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_03_model_states.pt. +[default5]:[2022-10-07 22:51:30,822] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default3]:[2022-10-07 22:51:30,948] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default7]:[2022-10-07 22:51:30,969] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default7]:[2022-10-07 22:51:30,969] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_01-model_states.pt... +[default5]:[2022-10-07 22:51:30,961] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default5]:[2022-10-07 22:51:30,962] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_01-model_states.pt... +[default3]:[2022-10-07 22:51:30,951] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_01-model_states.pt... +[default1]:[2022-10-07 22:51:30,951] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default1]:[2022-10-07 22:51:30,952] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_01-model_states.pt... +[default7]:[2022-10-07 22:51:31,049] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_01-model_states.pt. +[default7]:[2022-10-07 22:51:31,054] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default5]:[2022-10-07 22:51:31,046] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_01-model_states.pt. +[default5]:[2022-10-07 22:51:31,051] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default7]:[2022-10-07 22:51:31,178] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default7]:[2022-10-07 22:51:31,178] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_01-model_states.pt... +[default7]:[2022-10-07 22:51:31,220] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_01-model_states.pt. +[default7]:[2022-10-07 22:51:31,225] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default5]:[2022-10-07 22:51:31,173] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default5]:[2022-10-07 22:51:31,174] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_01-model_states.pt... +[default5]:[2022-10-07 22:51:31,218] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_01-model_states.pt. +[default5]:[2022-10-07 22:51:31,222] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default0]:[2022-10-07 22:51:31,230] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default0]:[2022-10-07 22:51:31,230] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default3]:[2022-10-07 22:51:31,172] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_01-model_states.pt. +[default1]:[2022-10-07 22:51:31,177] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_01-model_states.pt. +[default1]:[2022-10-07 22:51:31,210] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default3]:[2022-10-07 22:51:31,223] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default7]:[2022-10-07 22:51:31,279] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default7]:[2022-10-07 22:51:31,279] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_01-model_states.pt... +[default5]:[2022-10-07 22:51:31,275] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default5]:[2022-10-07 22:51:31,276] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_01-model_states.pt... +[default1]:[2022-10-07 22:51:31,309] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default1]:[2022-10-07 22:51:31,310] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_01-model_states.pt... +[default3]:[2022-10-07 22:51:31,298] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default3]:[2022-10-07 22:51:31,300] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_01-model_states.pt... +[default7]:[2022-10-07 22:51:31,354] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_01-model_states.pt. +[default7]:[2022-10-07 22:51:31,361] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default7]:[2022-10-07 22:51:31,401] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default7]:[2022-10-07 22:51:31,402] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_01-model_states.pt... +[default5]:[2022-10-07 22:51:31,347] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_01-model_states.pt. +[default5]:[2022-10-07 22:51:31,358] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default5]:[2022-10-07 22:51:31,402] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default5]:[2022-10-07 22:51:31,402] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_01-model_states.pt... +[default7]:[2022-10-07 22:51:31,404] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_03_model_states.pt. +[default7]:[2022-10-07 22:51:31,404] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default5]:[2022-10-07 22:51:31,404] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_03_model_states.pt. +[default5]:[2022-10-07 22:51:31,404] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default1]:[2022-10-07 22:51:31,358] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_01-model_states.pt. +[default1]:[2022-10-07 22:51:31,365] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default1]:[2022-10-07 22:51:31,440] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default1]:[2022-10-07 22:51:31,440] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_01-model_states.pt... +[default3]:[2022-10-07 22:51:31,353] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_01-model_states.pt. +[default3]:[2022-10-07 22:51:31,360] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default3]:[2022-10-07 22:51:31,432] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default3]:[2022-10-07 22:51:31,434] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_01-model_states.pt... +[default7]:[2022-10-07 22:51:31,491] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_01-model_states.pt. +[default7]:[2022-10-07 22:51:31,496] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default5]:[2022-10-07 22:51:31,490] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_01-model_states.pt. +[default5]:[2022-10-07 22:51:31,499] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default7]:[2022-10-07 22:51:31,502] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default7]:[2022-10-07 22:51:31,503] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_01-model_states.pt... +[default5]:[2022-10-07 22:51:31,495] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default5]:[2022-10-07 22:51:31,495] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_01-model_states.pt... +[default0]:[2022-10-07 22:51:31,501] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default0]:[2022-10-07 22:51:31,503] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default1]:[2022-10-07 22:51:31,491] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_01-model_states.pt. +[default1]:[2022-10-07 22:51:31,499] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default3]:[2022-10-07 22:51:31,481] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_01-model_states.pt. +[default3]:[2022-10-07 22:51:31,487] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default3]:[2022-10-07 22:51:31,545] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default3]:[2022-10-07 22:51:31,547] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_01-model_states.pt... +[default7]:[2022-10-07 22:51:31,544] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default7]:[2022-10-07 22:51:31,545] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_01-model_states.pt... +[default7]:[2022-10-07 22:51:31,607] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_01-model_states.pt. +[default7]:[2022-10-07 22:51:31,612] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default6]:[2022-10-07 22:51:31,572] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_02_model_states.pt. +[default6]:[2022-10-07 22:51:31,573] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default6]:[2022-10-07 22:51:31,592] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default6]:[2022-10-07 22:51:31,592] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default6]:[2022-10-07 22:51:31,618] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default4]:[2022-10-07 22:51:31,572] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_02_model_states.pt. +[default6]:[2022-10-07 22:51:31,625] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default4]:[2022-10-07 22:51:31,572] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default4]:[2022-10-07 22:51:31,593] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default5]:[2022-10-07 22:51:31,542] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default5]:[2022-10-07 22:51:31,543] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_01-model_states.pt... +[default4]:[2022-10-07 22:51:31,593] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default4]:[2022-10-07 22:51:31,615] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default4]:[2022-10-07 22:51:31,623] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default5]:[2022-10-07 22:51:31,598] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_01-model_states.pt. +[default5]:[2022-10-07 22:51:31,607] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default4]:[2022-10-07 22:51:31,561] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_02_model_states.pt. +[default4]:[2022-10-07 22:51:31,562] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default4]:[2022-10-07 22:51:31,586] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default4]:[2022-10-07 22:51:31,586] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default4]:[2022-10-07 22:51:31,608] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default4]:[2022-10-07 22:51:31,613] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default7]:[2022-10-07 22:51:31,552] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_01-model_states.pt. +[default7]:[2022-10-07 22:51:31,562] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default7]:[2022-10-07 22:51:31,609] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default7]:[2022-10-07 22:51:31,609] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_01-model_states.pt... +[default6]:[2022-10-07 22:51:31,561] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_02_model_states.pt. +[default6]:[2022-10-07 22:51:31,561] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default6]:[2022-10-07 22:51:31,582] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default6]:[2022-10-07 22:51:31,582] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default6]:[2022-10-07 22:51:31,601] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default6]:[2022-10-07 22:51:31,607] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default6]:[2022-10-07 22:51:31,628] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default6]:[2022-10-07 22:51:31,628] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default5]:[2022-10-07 22:51:31,547] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_01-model_states.pt. +[default5]:[2022-10-07 22:51:31,554] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default5]:[2022-10-07 22:51:31,604] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default5]:[2022-10-07 22:51:31,605] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_01-model_states.pt... +[default1]:[2022-10-07 22:51:31,558] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default1]:[2022-10-07 22:51:31,558] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_01-model_states.pt... +[default1]:[2022-10-07 22:51:31,621] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_01-model_states.pt. +[default1]:[2022-10-07 22:51:31,627] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default3]:[2022-10-07 22:51:31,611] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_01-model_states.pt. +[default3]:[2022-10-07 22:51:31,617] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default7]:[2022-10-07 22:51:31,666] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default7]:[2022-10-07 22:51:31,666] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_01-model_states.pt... +[default7]:[2022-10-07 22:51:31,720] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_01-model_states.pt. +[default7]:[2022-10-07 22:51:31,728] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default6]:[2022-10-07 22:51:31,647] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default6]:[2022-10-07 22:51:31,647] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default6]:[2022-10-07 22:51:31,667] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default6]:[2022-10-07 22:51:31,672] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default6]:[2022-10-07 22:51:31,697] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default6]:[2022-10-07 22:51:31,697] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default6]:[2022-10-07 22:51:31,725] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default6]:[2022-10-07 22:51:31,731] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default4]:[2022-10-07 22:51:31,649] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default4]:[2022-10-07 22:51:31,649] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default4]:[2022-10-07 22:51:31,682] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default4]:[2022-10-07 22:51:31,689] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default4]:[2022-10-07 22:51:31,730] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default4]:[2022-10-07 22:51:31,730] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default5]:[2022-10-07 22:51:31,665] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default5]:[2022-10-07 22:51:31,666] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_01-model_states.pt... +[default5]:[2022-10-07 22:51:31,718] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_01-model_states.pt. +[default5]:[2022-10-07 22:51:31,727] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default4]:[2022-10-07 22:51:31,638] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default4]:[2022-10-07 22:51:31,638] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default4]:[2022-10-07 22:51:31,662] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default4]:[2022-10-07 22:51:31,667] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default4]:[2022-10-07 22:51:31,697] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default4]:[2022-10-07 22:51:31,697] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default4]:[2022-10-07 22:51:31,721] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default4]:[2022-10-07 22:51:31,726] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default7]:[2022-10-07 22:51:31,660] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_01-model_states.pt. +[default7]:[2022-10-07 22:51:31,668] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default7]:[2022-10-07 22:51:31,721] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default7]:[2022-10-07 22:51:31,722] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_01-model_states.pt... +[default6]:[2022-10-07 22:51:31,646] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default5]:[2022-10-07 22:51:31,655] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_01-model_states.pt. +[default6]:[2022-10-07 22:51:31,652] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default6]:[2022-10-07 22:51:31,696] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default6]:[2022-10-07 22:51:31,696] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default6]:[2022-10-07 22:51:31,716] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default5]:[2022-10-07 22:51:31,662] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default6]:[2022-10-07 22:51:31,723] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default5]:[2022-10-07 22:51:31,704] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default5]:[2022-10-07 22:51:31,705] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_01-model_states.pt... +[default2]:[2022-10-07 22:51:31,686] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default2]:[2022-10-07 22:51:31,686] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default0]:[2022-10-07 22:51:31,731] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default1]:[2022-10-07 22:51:31,702] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default1]:[2022-10-07 22:51:31,703] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_01-model_states.pt... +[default3]:[2022-10-07 22:51:31,695] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default3]:[2022-10-07 22:51:31,697] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_01-model_states.pt... +[default3]:[2022-10-07 22:51:31,748] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_01-model_states.pt. +[default7]:[2022-10-07 22:51:31,775] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default7]:[2022-10-07 22:51:31,776] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_01-model_states.pt... +[default7]:[2022-10-07 22:51:31,816] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_01-model_states.pt. +[default7]:[2022-10-07 22:51:31,821] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default6]:[2022-10-07 22:51:31,756] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default6]:[2022-10-07 22:51:31,757] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default6]:[2022-10-07 22:51:31,781] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default6]:[2022-10-07 22:51:31,788] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default6]:[2022-10-07 22:51:31,821] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default6]:[2022-10-07 22:51:31,821] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default4]:[2022-10-07 22:51:31,753] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default4]:[2022-10-07 22:51:31,759] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default4]:[2022-10-07 22:51:31,791] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default4]:[2022-10-07 22:51:31,791] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default4]:[2022-10-07 22:51:31,825] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default4]:[2022-10-07 22:51:31,833] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default5]:[2022-10-07 22:51:31,776] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default5]:[2022-10-07 22:51:31,777] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_01-model_states.pt... +[default5]:[2022-10-07 22:51:31,816] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_01-model_states.pt. +[default5]:[2022-10-07 22:51:31,823] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default4]:[2022-10-07 22:51:31,764] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default4]:[2022-10-07 22:51:31,764] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default4]:[2022-10-07 22:51:31,786] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default4]:[2022-10-07 22:51:31,790] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default4]:[2022-10-07 22:51:31,831] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default4]:[2022-10-07 22:51:31,832] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default7]:[2022-10-07 22:51:31,769] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_01-model_states.pt. +[default7]:[2022-10-07 22:51:31,780] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default7]:[2022-10-07 22:51:31,818] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default7]:[2022-10-07 22:51:31,818] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_01-model_states.pt... +[default5]:[2022-10-07 22:51:31,750] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_01-model_states.pt. +[default6]:[2022-10-07 22:51:31,761] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default5]:[2022-10-07 22:51:31,757] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default5]:[2022-10-07 22:51:31,792] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default5]:[2022-10-07 22:51:31,793] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_01-model_states.pt... +[default6]:[2022-10-07 22:51:31,762] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default6]:[2022-10-07 22:51:31,780] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default6]:[2022-10-07 22:51:31,786] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default6]:[2022-10-07 22:51:31,831] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default6]:[2022-10-07 22:51:31,831] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default0]:[2022-10-07 22:51:31,795] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default0]:[2022-10-07 22:51:31,824] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default0]:[2022-10-07 22:51:31,824] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default0]:[2022-10-07 22:51:31,848] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default3]:[2022-10-07 22:51:31,755] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default3]:[2022-10-07 22:51:31,820] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default3]:[2022-10-07 22:51:31,823] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_01-model_states.pt... +[default1]:[2022-10-07 22:51:31,760] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_01-model_states.pt. +[default1]:[2022-10-07 22:51:31,766] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default1]:[2022-10-07 22:51:31,821] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default1]:[2022-10-07 22:51:31,823] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_01-model_states.pt... +[default7]:[2022-10-07 22:51:31,873] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default7]:[2022-10-07 22:51:31,874] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_01-model_states.pt... +[default7]:[2022-10-07 22:51:31,922] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_01-model_states.pt. +[default7]:[2022-10-07 22:51:31,931] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default6]:[2022-10-07 22:51:31,850] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default6]:[2022-10-07 22:51:31,856] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default6]:[2022-10-07 22:51:31,878] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default6]:[2022-10-07 22:51:31,878] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default6]:[2022-10-07 22:51:31,908] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default6]:[2022-10-07 22:51:31,915] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default4]:[2022-10-07 22:51:31,862] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default4]:[2022-10-07 22:51:31,863] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default4]:[2022-10-07 22:51:31,894] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default4]:[2022-10-07 22:51:31,901] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default4]:[2022-10-07 22:51:31,935] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default4]:[2022-10-07 22:51:31,935] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default5]:[2022-10-07 22:51:31,873] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default5]:[2022-10-07 22:51:31,874] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_01-model_states.pt... +[default5]:[2022-10-07 22:51:31,919] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_01-model_states.pt. +[default5]:[2022-10-07 22:51:31,929] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default0]:[2022-10-07 22:51:31,889] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_02_model_states.pt. +[default0]:[2022-10-07 22:51:31,889] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default0]:[2022-10-07 22:51:31,932] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default0]:[2022-10-07 22:51:31,932] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default4]:[2022-10-07 22:51:31,854] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default4]:[2022-10-07 22:51:31,859] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default4]:[2022-10-07 22:51:31,892] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default4]:[2022-10-07 22:51:31,892] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default4]:[2022-10-07 22:51:31,919] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default4]:[2022-10-07 22:51:31,923] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default7]:[2022-10-07 22:51:31,855] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_01-model_states.pt. +[default7]:[2022-10-07 22:51:31,862] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default7]:[2022-10-07 22:51:31,911] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default7]:[2022-10-07 22:51:31,912] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_01-model_states.pt... +[default5]:[2022-10-07 22:51:31,842] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_01-model_states.pt. +[default5]:[2022-10-07 22:51:31,848] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default5]:[2022-10-07 22:51:31,883] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default5]:[2022-10-07 22:51:31,884] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_01-model_states.pt... +[default5]:[2022-10-07 22:51:31,922] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_01-model_states.pt. +[default5]:[2022-10-07 22:51:31,929] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default6]:[2022-10-07 22:51:31,850] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default6]:[2022-10-07 22:51:31,856] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default6]:[2022-10-07 22:51:31,892] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default6]:[2022-10-07 22:51:31,892] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default6]:[2022-10-07 22:51:31,911] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default6]:[2022-10-07 22:51:31,918] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default2]:[2022-10-07 22:51:31,880] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default2]:[2022-10-07 22:51:31,882] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default0]:[2022-10-07 22:51:31,855] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default0]:[2022-10-07 22:51:31,900] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default3]:[2022-10-07 22:51:31,872] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_01-model_states.pt. +[default3]:[2022-10-07 22:51:31,878] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default3]:[2022-10-07 22:51:31,931] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default3]:[2022-10-07 22:51:31,932] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_01-model_states.pt... +[default0]:[2022-10-07 22:51:31,900] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default0]:[2022-10-07 22:51:31,923] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default0]:[2022-10-07 22:51:31,930] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default1]:[2022-10-07 22:51:31,875] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_01-model_states.pt. +[default1]:[2022-10-07 22:51:31,880] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default1]:[2022-10-07 22:51:31,935] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default1]:[2022-10-07 22:51:31,935] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_01-model_states.pt... +[default7]:[2022-10-07 22:51:31,979] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default7]:[2022-10-07 22:51:31,979] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_01-model_states.pt... +[default7]:[2022-10-07 22:51:32,030] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_01-model_states.pt. +[default6]:[2022-10-07 22:51:31,940] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default6]:[2022-10-07 22:51:31,940] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default6]:[2022-10-07 22:51:31,966] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default6]:[2022-10-07 22:51:31,974] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default6]:[2022-10-07 22:51:32,008] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default6]:[2022-10-07 22:51:32,009] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default6]:[2022-10-07 22:51:32,032] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default2]:[2022-10-07 22:51:31,982] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_02_model_states.pt. +[default2]:[2022-10-07 22:51:31,983] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default2]:[2022-10-07 22:51:32,023] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default2]:[2022-10-07 22:51:32,024] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default5]:[2022-10-07 22:51:31,978] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default5]:[2022-10-07 22:51:31,979] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_01-model_states.pt... +[default5]:[2022-10-07 22:51:32,029] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_01-model_states.pt. +[default4]:[2022-10-07 22:51:31,967] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default4]:[2022-10-07 22:51:31,975] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default4]:[2022-10-07 22:51:32,008] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default4]:[2022-10-07 22:51:32,008] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default4]:[2022-10-07 22:51:32,030] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default4]:[2022-10-07 22:51:31,954] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default4]:[2022-10-07 22:51:31,954] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default4]:[2022-10-07 22:51:31,974] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default4]:[2022-10-07 22:51:31,978] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default4]:[2022-10-07 22:51:32,026] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default4]:[2022-10-07 22:51:32,026] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default7]:[2022-10-07 22:51:31,950] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_01-model_states.pt. +[default7]:[2022-10-07 22:51:31,960] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default7]:[2022-10-07 22:51:31,995] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default7]:[2022-10-07 22:51:31,995] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_01-model_states.pt... +[default7]:[2022-10-07 22:51:32,028] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_01-model_states.pt. +[default5]:[2022-10-07 22:51:31,965] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default5]:[2022-10-07 22:51:31,966] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_01-model_states.pt... +[default5]:[2022-10-07 22:51:32,006] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_01-model_states.pt. +[default5]:[2022-10-07 22:51:32,013] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default5]:[2022-10-07 22:51:32,036] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default5]:[2022-10-07 22:51:32,037] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_01-model_states.pt... +[default6]:[2022-10-07 22:51:31,956] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default6]:[2022-10-07 22:51:31,956] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default6]:[2022-10-07 22:51:31,974] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default6]:[2022-10-07 22:51:31,981] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default6]:[2022-10-07 22:51:32,025] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default6]:[2022-10-07 22:51:32,025] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default3]:[2022-10-07 22:51:31,979] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_01-model_states.pt. +[default0]:[2022-10-07 22:51:31,963] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default3]:[2022-10-07 22:51:31,984] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default3]:[2022-10-07 22:51:32,035] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default3]:[2022-10-07 22:51:32,037] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_01-model_states.pt... +[default0]:[2022-10-07 22:51:31,964] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default0]:[2022-10-07 22:51:31,999] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default0]:[2022-10-07 22:51:32,005] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default0]:[2022-10-07 22:51:32,031] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default1]:[2022-10-07 22:51:31,990] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_01-model_states.pt. +[default1]:[2022-10-07 22:51:31,996] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default1]:[2022-10-07 22:51:32,037] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default1]:[2022-10-07 22:51:32,037] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_01-model_states.pt... +[default0]:[2022-10-07 22:51:32,031] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default0]:[2022-10-07 22:51:31,959] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default0]:[2022-10-07 22:51:31,965] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default0]:[2022-10-07 22:51:31,994] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default0]:[2022-10-07 22:51:31,994] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default0]:[2022-10-07 22:51:32,022] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default0]:[2022-10-07 22:51:32,027] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default7]:[2022-10-07 22:51:32,036] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default7]:[2022-10-07 22:51:32,079] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default7]:[2022-10-07 22:51:32,079] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_01-model_states.pt... +[default7]:[2022-10-07 22:51:32,118] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_01-model_states.pt. +[default7]:[2022-10-07 22:51:32,123] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default6]:[2022-10-07 22:51:32,040] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default6]:[2022-10-07 22:51:32,080] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default6]:[2022-10-07 22:51:32,081] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default6]:[2022-10-07 22:51:32,103] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default6]:[2022-10-07 22:51:32,111] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default6]:[2022-10-07 22:51:32,135] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default6]:[2022-10-07 22:51:32,135] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default4]:[2022-10-07 22:51:32,048] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default4]:[2022-10-07 22:51:32,054] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default4]:[2022-10-07 22:51:32,087] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default4]:[2022-10-07 22:51:32,087] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default4]:[2022-10-07 22:51:32,108] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default4]:[2022-10-07 22:51:32,112] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default7]:[2022-10-07 22:51:32,038] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default7]:[2022-10-07 22:51:32,078] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default7]:[2022-10-07 22:51:32,079] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_01-model_states.pt... +[default7]:[2022-10-07 22:51:32,114] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_01-model_states.pt. +[default7]:[2022-10-07 22:51:32,124] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default5]:[2022-10-07 22:51:32,085] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_01-model_states.pt. +[default5]:[2022-10-07 22:51:32,092] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default5]:[2022-10-07 22:51:32,127] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default5]:[2022-10-07 22:51:32,128] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_01-model_states.pt... +[default6]:[2022-10-07 22:51:32,043] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default6]:[2022-10-07 22:51:32,050] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default6]:[2022-10-07 22:51:32,086] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default6]:[2022-10-07 22:51:32,086] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default6]:[2022-10-07 22:51:32,102] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default6]:[2022-10-07 22:51:32,108] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default2]:[2022-10-07 22:51:32,060] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default3]:[2022-10-07 22:51:32,086] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_01-model_states.pt. +[default3]:[2022-10-07 22:51:32,092] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default3]:[2022-10-07 22:51:32,142] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default3]:[2022-10-07 22:51:32,144] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_01-model_states.pt... +[default1]:[2022-10-07 22:51:32,097] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_01-model_states.pt. +[default1]:[2022-10-07 22:51:32,104] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default0]:[2022-10-07 22:51:32,054] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default0]:[2022-10-07 22:51:32,060] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default0]:[2022-10-07 22:51:32,107] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default0]:[2022-10-07 22:51:32,107] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default0]:[2022-10-07 22:51:32,132] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default0]:[2022-10-07 22:51:32,138] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default7]:[2022-10-07 22:51:32,133] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_01_model_states.pt. +[default7]:[2022-10-07 22:51:32,133] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default4]:[2022-10-07 22:51:32,112] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default4]:[2022-10-07 22:51:32,112] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default6]:[2022-10-07 22:51:32,112] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default6]:[2022-10-07 22:51:32,112] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default5]:[2022-10-07 22:51:32,132] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_01_model_states.pt. +[default5]:[2022-10-07 22:51:32,133] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default2]:[2022-10-07 22:51:32,067] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default2]:[2022-10-07 22:51:32,075] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default2]:[2022-10-07 22:51:32,116] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default2]:[2022-10-07 22:51:32,116] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default4]:[2022-10-07 22:51:32,037] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default4]:[2022-10-07 22:51:32,063] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default4]:[2022-10-07 22:51:32,063] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default4]:[2022-10-07 22:51:32,084] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default4]:[2022-10-07 22:51:32,091] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default4]:[2022-10-07 22:51:32,125] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default4]:[2022-10-07 22:51:32,126] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default5]:[2022-10-07 22:51:32,036] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default5]:[2022-10-07 22:51:32,078] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default5]:[2022-10-07 22:51:32,079] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_01-model_states.pt... +[default5]:[2022-10-07 22:51:32,110] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_01-model_states.pt. +[default5]:[2022-10-07 22:51:32,118] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default0]:[2022-10-07 22:51:32,054] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default0]:[2022-10-07 22:51:32,055] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default0]:[2022-10-07 22:51:32,084] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default0]:[2022-10-07 22:51:32,089] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default0]:[2022-10-07 22:51:32,113] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default0]:[2022-10-07 22:51:32,113] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default1]:[2022-10-07 22:51:32,090] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_03_model_states.pt. +[default1]:[2022-10-07 22:51:32,091] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default1]:[2022-10-07 22:51:32,121] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default1]:[2022-10-07 22:51:32,121] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_01-model_states.pt... +[default7]:[2022-10-07 22:51:32,163] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default7]:[2022-10-07 22:51:32,163] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_01-model_states.pt... +[default7]:[2022-10-07 22:51:32,200] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_01-model_states.pt. +[default7]:[2022-10-07 22:51:32,207] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default6]:[2022-10-07 22:51:32,155] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default6]:[2022-10-07 22:51:32,165] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default6]:[2022-10-07 22:51:32,187] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default6]:[2022-10-07 22:51:32,187] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default6]:[2022-10-07 22:51:32,208] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default6]:[2022-10-07 22:51:32,217] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default2]:[2022-10-07 22:51:32,157] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default2]:[2022-10-07 22:51:32,165] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default2]:[2022-10-07 22:51:32,205] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default2]:[2022-10-07 22:51:32,206] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default4]:[2022-10-07 22:51:32,156] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default4]:[2022-10-07 22:51:32,166] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default4]:[2022-10-07 22:51:32,198] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default4]:[2022-10-07 22:51:32,198] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default4]:[2022-10-07 22:51:32,229] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default5]:[2022-10-07 22:51:32,162] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default5]:[2022-10-07 22:51:32,163] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_01-model_states.pt... +[default5]:[2022-10-07 22:51:32,198] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_01-model_states.pt. +[default5]:[2022-10-07 22:51:32,204] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default3]:[2022-10-07 22:51:32,224] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_03_model_states.pt. +[default3]:[2022-10-07 22:51:32,225] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default0]:[2022-10-07 22:51:32,137] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default0]:[2022-10-07 22:51:32,142] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default0]:[2022-10-07 22:51:32,166] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default0]:[2022-10-07 22:51:32,166] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default0]:[2022-10-07 22:51:32,193] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default0]:[2022-10-07 22:51:32,198] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default0]:[2022-10-07 22:51:32,229] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default0]:[2022-10-07 22:51:32,229] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default4]:[2022-10-07 22:51:32,161] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default4]:[2022-10-07 22:51:32,161] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default4]:[2022-10-07 22:51:32,182] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default4]:[2022-10-07 22:51:32,187] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default4]:[2022-10-07 22:51:32,220] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default4]:[2022-10-07 22:51:32,221] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default1]:[2022-10-07 22:51:32,146] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_01-model_states.pt. +[default1]:[2022-10-07 22:51:32,152] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default1]:[2022-10-07 22:51:32,181] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default1]:[2022-10-07 22:51:32,182] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_01-model_states.pt... +[default1]:[2022-10-07 22:51:32,206] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_01-model_states.pt. +[default1]:[2022-10-07 22:51:32,215] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default2]:[2022-10-07 22:51:32,222] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_02_model_states.pt. +[default2]:[2022-10-07 22:51:32,223] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default7]:[2022-10-07 22:51:32,178] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default7]:[2022-10-07 22:51:32,178] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_01-model_states.pt... +[default7]:[2022-10-07 22:51:32,214] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_01-model_states.pt. +[default7]:[2022-10-07 22:51:32,225] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default5]:[2022-10-07 22:51:32,170] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_01-model_states.pt. +[default5]:[2022-10-07 22:51:32,176] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default5]:[2022-10-07 22:51:32,215] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default5]:[2022-10-07 22:51:32,216] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_01-model_states.pt... +[default6]:[2022-10-07 22:51:32,159] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default6]:[2022-10-07 22:51:32,160] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default6]:[2022-10-07 22:51:32,177] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default6]:[2022-10-07 22:51:32,183] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default6]:[2022-10-07 22:51:32,220] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default6]:[2022-10-07 22:51:32,220] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default6]:[2022-10-07 22:51:32,237] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default1]:[2022-10-07 22:51:32,245] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_01_model_states.pt. +[default2]:[2022-10-07 22:51:32,213] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default3]:[2022-10-07 22:51:32,186] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_01-model_states.pt. +[default3]:[2022-10-07 22:51:32,193] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default3]:[2022-10-07 22:51:32,248] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default3]:[2022-10-07 22:51:32,250] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_01-model_states.pt... +[default1]:[2022-10-07 22:51:32,183] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default1]:[2022-10-07 22:51:32,184] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_01-model_states.pt... +[default1]:[2022-10-07 22:51:32,236] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_01-model_states.pt. +[default1]:[2022-10-07 22:51:32,243] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default0]:[2022-10-07 22:51:32,161] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default0]:[2022-10-07 22:51:32,161] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default0]:[2022-10-07 22:51:32,186] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default0]:[2022-10-07 22:51:32,192] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default0]:[2022-10-07 22:51:32,221] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default0]:[2022-10-07 22:51:32,221] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default0]:[2022-10-07 22:51:32,249] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default2]:[2022-10-07 22:51:32,216] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default2]:[2022-10-07 22:51:32,217] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default7]:[2022-10-07 22:51:32,262] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default3]:[2022-10-07 22:51:32,189] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_01_model_states.pt. +[default3]:[2022-10-07 22:51:32,190] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default6]:[2022-10-07 22:51:32,262] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default6]:[2022-10-07 22:51:32,263] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default5]:[2022-10-07 22:51:32,261] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default5]:[2022-10-07 22:51:32,263] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_01-model_states.pt... +[default7]:[2022-10-07 22:51:32,245] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default7]:[2022-10-07 22:51:32,245] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_01-model_states.pt... +[default7]:[2022-10-07 22:51:32,282] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_01-model_states.pt. +[default7]:[2022-10-07 22:51:32,288] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default7]:[2022-10-07 22:51:32,289] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default7]:[2022-10-07 22:51:32,289] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_01-model_states.pt... +[default7]:[2022-10-07 22:51:32,289] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_01-model_states.pt. +[default6]:[2022-10-07 22:51:32,246] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default6]:[2022-10-07 22:51:32,246] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default6]:[2022-10-07 22:51:32,269] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default6]:[2022-10-07 22:51:32,280] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default6]:[2022-10-07 22:51:32,282] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default6]:[2022-10-07 22:51:32,283] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default6]:[2022-10-07 22:51:32,283] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default2]:[2022-10-07 22:51:32,242] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default2]:[2022-10-07 22:51:32,249] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default2]:[2022-10-07 22:51:32,287] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default2]:[2022-10-07 22:51:32,287] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default2]:[2022-10-07 22:51:32,313] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default2]:[2022-10-07 22:51:32,321] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default4]:[2022-10-07 22:51:32,237] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default4]:[2022-10-07 22:51:32,267] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default4]:[2022-10-07 22:51:32,267] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default4]:[2022-10-07 22:51:32,293] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default4]:[2022-10-07 22:51:32,300] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default4]:[2022-10-07 22:51:32,323] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default4]:[2022-10-07 22:51:32,323] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default5]:[2022-10-07 22:51:32,244] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default5]:[2022-10-07 22:51:32,245] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_01-model_states.pt... +[default5]:[2022-10-07 22:51:32,280] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_01-model_states.pt. +[default5]:[2022-10-07 22:51:32,287] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default5]:[2022-10-07 22:51:32,287] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default5]:[2022-10-07 22:51:32,288] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_01-model_states.pt... +[default5]:[2022-10-07 22:51:32,289] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_01-model_states.pt. +[default3]:[2022-10-07 22:51:32,267] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default3]:[2022-10-07 22:51:32,267] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_01-model_states.pt... +[default3]:[2022-10-07 22:51:32,293] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_01-model_states.pt. +[default3]:[2022-10-07 22:51:32,299] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default3]:[2022-10-07 22:51:32,325] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default3]:[2022-10-07 22:51:32,326] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_01-model_states.pt... +[default0]:[2022-10-07 22:51:32,254] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default0]:[2022-10-07 22:51:32,259] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default0]:[2022-10-07 22:51:32,286] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default0]:[2022-10-07 22:51:32,286] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default0]:[2022-10-07 22:51:32,314] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default0]:[2022-10-07 22:51:32,320] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default4]:[2022-10-07 22:51:32,243] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default4]:[2022-10-07 22:51:32,248] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default4]:[2022-10-07 22:51:32,278] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default4]:[2022-10-07 22:51:32,278] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default4]:[2022-10-07 22:51:32,300] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default4]:[2022-10-07 22:51:32,304] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default4]:[2022-10-07 22:51:32,305] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default4]:[2022-10-07 22:51:32,306] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default4]:[2022-10-07 22:51:32,306] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default1]:[2022-10-07 22:51:32,258] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default1]:[2022-10-07 22:51:32,259] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_01-model_states.pt... +[default1]:[2022-10-07 22:51:32,287] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_01-model_states.pt. +[default1]:[2022-10-07 22:51:32,293] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default1]:[2022-10-07 22:51:32,317] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default1]:[2022-10-07 22:51:32,317] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_01-model_states.pt... +[default0]:[2022-10-07 22:51:32,330] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default1]:[2022-10-07 22:51:32,245] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default0]:[2022-10-07 22:51:32,331] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default2]:[2022-10-07 22:51:32,251] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default2]:[2022-10-07 22:51:32,251] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default2]:[2022-10-07 22:51:32,285] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default2]:[2022-10-07 22:51:32,301] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default2]:[2022-10-07 22:51:32,336] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default2]:[2022-10-07 22:51:32,337] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default3]:[2022-10-07 22:51:32,307] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_01-model_states.pt. +[default3]:[2022-10-07 22:51:32,312] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default1]:[2022-10-07 22:51:32,299] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default1]:[2022-10-07 22:51:32,300] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_01-model_states.pt... +[default1]:[2022-10-07 22:51:32,347] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_01-model_states.pt. +[default0]:[2022-10-07 22:51:32,255] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default0]:[2022-10-07 22:51:32,276] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default0]:[2022-10-07 22:51:32,276] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default0]:[2022-10-07 22:51:32,297] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default0]:[2022-10-07 22:51:32,304] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default0]:[2022-10-07 22:51:32,328] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default0]:[2022-10-07 22:51:32,328] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default3]:[2022-10-07 22:51:32,339] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default4]:[2022-10-07 22:51:32,267] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default4]:[2022-10-07 22:51:32,269] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default3]:[2022-10-07 22:51:32,340] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_01-model_states.pt... +[default7]:[2022-10-07 22:51:32,263] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_01-model_states.pt... +[default2]:[2022-10-07 22:51:32,262] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default2]:[2022-10-07 22:51:32,263] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default2]:[2022-10-07 22:51:32,291] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default2]:[2022-10-07 22:51:32,302] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default2]:[2022-10-07 22:51:32,328] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default2]:[2022-10-07 22:51:32,328] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default7]:[2022-10-07 22:51:32,261] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default7]:[2022-10-07 22:51:32,261] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_01-model_states.pt... +[default7]:[2022-10-07 22:51:32,295] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_01-model_states.pt. +[default7]:[2022-10-07 22:51:32,306] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default7]:[2022-10-07 22:51:32,331] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default7]:[2022-10-07 22:51:32,331] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_01-model_states.pt... +[default5]:[2022-10-07 22:51:32,265] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_01-model_states.pt. +[default5]:[2022-10-07 22:51:32,272] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default5]:[2022-10-07 22:51:32,304] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default5]:[2022-10-07 22:51:32,304] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_01-model_states.pt... +[default6]:[2022-10-07 22:51:32,243] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default6]:[2022-10-07 22:51:32,278] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default6]:[2022-10-07 22:51:32,278] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default6]:[2022-10-07 22:51:32,295] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default6]:[2022-10-07 22:51:32,305] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default6]:[2022-10-07 22:51:32,305] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default6]:[2022-10-07 22:51:32,306] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default6]:[2022-10-07 22:51:32,306] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default2]:[2022-10-07 22:51:32,345] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default2]:[2022-10-07 22:51:32,345] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default2]:[2022-10-07 22:51:32,375] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default2]:[2022-10-07 22:51:32,381] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default2]:[2022-10-07 22:51:32,405] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default2]:[2022-10-07 22:51:32,405] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default2]:[2022-10-07 22:51:32,434] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default4]:[2022-10-07 22:51:32,345] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default4]:[2022-10-07 22:51:32,353] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default4]:[2022-10-07 22:51:32,353] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default4]:[2022-10-07 22:51:32,354] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default4]:[2022-10-07 22:51:32,354] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default3]:[2022-10-07 22:51:32,348] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_01-model_states.pt. +[default3]:[2022-10-07 22:51:32,354] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default3]:[2022-10-07 22:51:32,374] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default3]:[2022-10-07 22:51:32,374] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_01-model_states.pt... +[default3]:[2022-10-07 22:51:32,393] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_01-model_states.pt. +[default3]:[2022-10-07 22:51:32,398] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default3]:[2022-10-07 22:51:32,416] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default3]:[2022-10-07 22:51:32,416] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_01-model_states.pt... +[default3]:[2022-10-07 22:51:32,434] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_01-model_states.pt. +[default0]:[2022-10-07 22:51:32,349] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default0]:[2022-10-07 22:51:32,349] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default0]:[2022-10-07 22:51:32,375] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default0]:[2022-10-07 22:51:32,380] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default0]:[2022-10-07 22:51:32,401] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default0]:[2022-10-07 22:51:32,401] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default0]:[2022-10-07 22:51:32,417] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default0]:[2022-10-07 22:51:32,422] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default1]:[2022-10-07 22:51:32,341] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_01-model_states.pt. +[default1]:[2022-10-07 22:51:32,347] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default1]:[2022-10-07 22:51:32,366] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default1]:[2022-10-07 22:51:32,367] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_01-model_states.pt... +[default1]:[2022-10-07 22:51:32,384] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_01-model_states.pt. +[default1]:[2022-10-07 22:51:32,389] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default1]:[2022-10-07 22:51:32,407] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default1]:[2022-10-07 22:51:32,408] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_01-model_states.pt... +[default1]:[2022-10-07 22:51:32,424] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_01-model_states.pt. +[default1]:[2022-10-07 22:51:32,429] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default2]:[2022-10-07 22:51:32,357] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default2]:[2022-10-07 22:51:32,364] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default2]:[2022-10-07 22:51:32,394] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default2]:[2022-10-07 22:51:32,394] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default2]:[2022-10-07 22:51:32,422] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default2]:[2022-10-07 22:51:32,429] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default7]:[2022-10-07 22:51:32,361] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_01-model_states.pt. +[default7]:[2022-10-07 22:51:32,369] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default7]:[2022-10-07 22:51:32,395] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default7]:[2022-10-07 22:51:32,395] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_01-model_states.pt... +[default7]:[2022-10-07 22:51:32,425] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_01-model_states.pt. +[default7]:[2022-10-07 22:51:32,432] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default5]:[2022-10-07 22:51:32,345] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_01-model_states.pt. +[default5]:[2022-10-07 22:51:32,351] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default5]:[2022-10-07 22:51:32,376] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default5]:[2022-10-07 22:51:32,377] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_01-model_states.pt... +[default5]:[2022-10-07 22:51:32,417] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_01-model_states.pt. +[default5]:[2022-10-07 22:51:32,423] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default0]:[2022-10-07 22:51:32,374] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_02_model_states.pt. +[default0]:[2022-10-07 22:51:32,375] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default0]:[2022-10-07 22:51:32,403] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default0]:[2022-10-07 22:51:32,403] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default0]:[2022-10-07 22:51:32,431] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default0]:[2022-10-07 22:51:32,436] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default1]:[2022-10-07 22:51:32,414] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default1]:[2022-10-07 22:51:32,415] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_01-model_states.pt... +[default2]:[2022-10-07 22:51:32,364] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default2]:[2022-10-07 22:51:32,379] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default2]:[2022-10-07 22:51:32,414] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default2]:[2022-10-07 22:51:32,414] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default2]:[2022-10-07 22:51:32,445] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default3]:[2022-10-07 22:51:32,371] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default3]:[2022-10-07 22:51:32,374] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_01-model_states.pt... +[default3]:[2022-10-07 22:51:32,415] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_01-model_states.pt. +[default3]:[2022-10-07 22:51:32,420] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default1]:[2022-10-07 22:51:32,355] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default1]:[2022-10-07 22:51:32,406] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default1]:[2022-10-07 22:51:32,406] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_01-model_states.pt... +[default0]:[2022-10-07 22:51:32,355] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default0]:[2022-10-07 22:51:32,361] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default0]:[2022-10-07 22:51:32,384] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default0]:[2022-10-07 22:51:32,384] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default0]:[2022-10-07 22:51:32,405] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default0]:[2022-10-07 22:51:32,410] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default2]:[2022-10-07 22:51:32,427] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default2]:[2022-10-07 22:51:32,428] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default4]:[2022-10-07 22:51:32,420] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default7]:[2022-10-07 22:51:32,399] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_01-model_states.pt. +[default6]:[2022-10-07 22:51:32,379] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default6]:[2022-10-07 22:51:32,431] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default5]:[2022-10-07 22:51:32,405] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_01-model_states.pt. +[default2]:[2022-10-07 22:51:32,440] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default2]:[2022-10-07 22:51:32,462] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default2]:[2022-10-07 22:51:32,462] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default2]:[2022-10-07 22:51:32,485] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default2]:[2022-10-07 22:51:32,493] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default2]:[2022-10-07 22:51:32,516] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default2]:[2022-10-07 22:51:32,516] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default3]:[2022-10-07 22:51:32,439] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default3]:[2022-10-07 22:51:32,460] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default3]:[2022-10-07 22:51:32,461] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_01-model_states.pt... +[default3]:[2022-10-07 22:51:32,481] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_01-model_states.pt. +[default3]:[2022-10-07 22:51:32,485] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default3]:[2022-10-07 22:51:32,506] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default3]:[2022-10-07 22:51:32,506] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_01-model_states.pt... +[default3]:[2022-10-07 22:51:32,524] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_01-model_states.pt. +[default3]:[2022-10-07 22:51:32,530] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default0]:[2022-10-07 22:51:32,449] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default0]:[2022-10-07 22:51:32,449] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default0]:[2022-10-07 22:51:32,468] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default0]:[2022-10-07 22:51:32,473] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default0]:[2022-10-07 22:51:32,497] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default0]:[2022-10-07 22:51:32,498] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default0]:[2022-10-07 22:51:32,522] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default0]:[2022-10-07 22:51:32,527] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default1]:[2022-10-07 22:51:32,442] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default1]:[2022-10-07 22:51:32,442] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_01-model_states.pt... +[default1]:[2022-10-07 22:51:32,460] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_01-model_states.pt. +[default1]:[2022-10-07 22:51:32,465] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default1]:[2022-10-07 22:51:32,484] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default1]:[2022-10-07 22:51:32,484] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_01-model_states.pt... +[default1]:[2022-10-07 22:51:32,502] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_01-model_states.pt. +[default1]:[2022-10-07 22:51:32,508] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default1]:[2022-10-07 22:51:32,529] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default1]:[2022-10-07 22:51:32,529] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_01-model_states.pt... +[default2]:[2022-10-07 22:51:32,457] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default2]:[2022-10-07 22:51:32,457] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default2]:[2022-10-07 22:51:32,482] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default2]:[2022-10-07 22:51:32,489] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default2]:[2022-10-07 22:51:32,515] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default2]:[2022-10-07 22:51:32,515] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default2]:[2022-10-07 22:51:32,531] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default2]:[2022-10-07 22:51:32,535] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default7]:[2022-10-07 22:51:32,455] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default7]:[2022-10-07 22:51:32,455] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_01-model_states.pt... +[default7]:[2022-10-07 22:51:32,490] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_01-model_states.pt. +[default7]:[2022-10-07 22:51:32,496] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default7]:[2022-10-07 22:51:32,497] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default7]:[2022-10-07 22:51:32,497] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_01-model_states.pt... +[default7]:[2022-10-07 22:51:32,499] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_01-model_states.pt. +[default5]:[2022-10-07 22:51:32,453] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default5]:[2022-10-07 22:51:32,454] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_01-model_states.pt... +[default5]:[2022-10-07 22:51:32,492] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_01-model_states.pt. +[default5]:[2022-10-07 22:51:32,499] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default5]:[2022-10-07 22:51:32,499] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default5]:[2022-10-07 22:51:32,499] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_01-model_states.pt... +[default5]:[2022-10-07 22:51:32,500] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_01-model_states.pt. +[default0]:[2022-10-07 22:51:32,465] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default0]:[2022-10-07 22:51:32,465] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default0]:[2022-10-07 22:51:32,499] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default0]:[2022-10-07 22:51:32,504] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default0]:[2022-10-07 22:51:32,525] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default0]:[2022-10-07 22:51:32,525] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default2]:[2022-10-07 22:51:32,459] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default2]:[2022-10-07 22:51:32,491] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default2]:[2022-10-07 22:51:32,491] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default2]:[2022-10-07 22:51:32,530] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default2]:[2022-10-07 22:51:32,549] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default3]:[2022-10-07 22:51:32,454] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default3]:[2022-10-07 22:51:32,456] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_01-model_states.pt... +[default3]:[2022-10-07 22:51:32,512] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_01-model_states.pt. +[default3]:[2022-10-07 22:51:32,517] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default3]:[2022-10-07 22:51:32,541] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default3]:[2022-10-07 22:51:32,543] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_01-model_states.pt... +[default1]:[2022-10-07 22:51:32,452] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_01-model_states.pt. +[default1]:[2022-10-07 22:51:32,459] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default1]:[2022-10-07 22:51:32,509] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default1]:[2022-10-07 22:51:32,509] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_01-model_states.pt... +[default0]:[2022-10-07 22:51:32,455] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default0]:[2022-10-07 22:51:32,455] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default0]:[2022-10-07 22:51:32,476] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default0]:[2022-10-07 22:51:32,482] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default0]:[2022-10-07 22:51:32,536] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default0]:[2022-10-07 22:51:32,536] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default4]:[2022-10-07 22:51:32,532] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default4]:[2022-10-07 22:51:32,560] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default4]:[2022-10-07 22:51:32,560] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default3]:[2022-10-07 22:51:32,520] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_01-model_states.pt. +[default7]:[2022-10-07 22:51:32,482] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default7]:[2022-10-07 22:51:32,530] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default7]:[2022-10-07 22:51:32,531] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_01-model_states.pt... +[default6]:[2022-10-07 22:51:32,494] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default6]:[2022-10-07 22:51:32,494] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default6]:[2022-10-07 22:51:32,521] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default6]:[2022-10-07 22:51:32,527] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default5]:[2022-10-07 22:51:32,518] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default5]:[2022-10-07 22:51:32,544] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default5]:[2022-10-07 22:51:32,545] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_01-model_states.pt... +[default5]:[2022-10-07 22:51:32,565] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_01-model_states.pt. +[default2]:[2022-10-07 22:51:32,540] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default2]:[2022-10-07 22:51:32,547] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default2]:[2022-10-07 22:51:32,574] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default2]:[2022-10-07 22:51:32,574] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default2]:[2022-10-07 22:51:32,595] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default2]:[2022-10-07 22:51:32,602] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default2]:[2022-10-07 22:51:32,625] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default2]:[2022-10-07 22:51:32,626] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default3]:[2022-10-07 22:51:32,556] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default3]:[2022-10-07 22:51:32,556] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_01-model_states.pt... +[default3]:[2022-10-07 22:51:32,576] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_01-model_states.pt. +[default3]:[2022-10-07 22:51:32,584] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default3]:[2022-10-07 22:51:32,608] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default3]:[2022-10-07 22:51:32,609] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_01-model_states.pt... +[default3]:[2022-10-07 22:51:32,626] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_01-model_states.pt. +[default3]:[2022-10-07 22:51:32,631] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default0]:[2022-10-07 22:51:32,547] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default0]:[2022-10-07 22:51:32,547] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default0]:[2022-10-07 22:51:32,567] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default0]:[2022-10-07 22:51:32,573] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default0]:[2022-10-07 22:51:32,574] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default0]:[2022-10-07 22:51:32,574] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default0]:[2022-10-07 22:51:32,574] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default0]:[2022-10-07 22:51:32,619] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_02_optim_states.pt'] +[default0]:[2022-10-07 22:51:32,619] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_02_optim_states.pt'] +[default1]:[2022-10-07 22:51:32,548] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_01-model_states.pt. +[default1]:[2022-10-07 22:51:32,554] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default1]:[2022-10-07 22:51:32,572] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default1]:[2022-10-07 22:51:32,572] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_01-model_states.pt... +[default1]:[2022-10-07 22:51:32,590] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_01-model_states.pt. +[default1]:[2022-10-07 22:51:32,597] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default1]:[2022-10-07 22:51:32,614] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default1]:[2022-10-07 22:51:32,614] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_01-model_states.pt... +[default1]:[2022-10-07 22:51:32,633] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_01-model_states.pt. +[default2]:[2022-10-07 22:51:32,552] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default2]:[2022-10-07 22:51:32,552] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default2]:[2022-10-07 22:51:32,571] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default2]:[2022-10-07 22:51:32,576] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default2]:[2022-10-07 22:51:32,599] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default2]:[2022-10-07 22:51:32,599] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default2]:[2022-10-07 22:51:32,617] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default2]:[2022-10-07 22:51:32,622] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default0]:[2022-10-07 22:51:32,545] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default0]:[2022-10-07 22:51:32,550] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default0]:[2022-10-07 22:51:32,571] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default0]:[2022-10-07 22:51:32,572] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default0]:[2022-10-07 22:51:32,591] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default0]:[2022-10-07 22:51:32,596] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default0]:[2022-10-07 22:51:32,618] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default0]:[2022-10-07 22:51:32,618] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default0]:[2022-10-07 22:51:32,637] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default0]:[2022-10-07 22:51:32,640] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default1]:[2022-10-07 22:51:32,603] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_01-model_states.pt. +[default0]:[2022-10-07 22:51:32,643] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default4]:[2022-10-07 22:51:32,591] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default4]:[2022-10-07 22:51:32,591] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default2]:[2022-10-07 22:51:32,585] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default2]:[2022-10-07 22:51:32,585] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default2]:[2022-10-07 22:51:32,606] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default2]:[2022-10-07 22:51:32,618] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default2]:[2022-10-07 22:51:32,643] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default2]:[2022-10-07 22:51:32,643] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default3]:[2022-10-07 22:51:32,584] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_01-model_states.pt. +[default1]:[2022-10-07 22:51:32,564] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_01-model_states.pt. +[default1]:[2022-10-07 22:51:32,570] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default1]:[2022-10-07 22:51:32,602] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default1]:[2022-10-07 22:51:32,602] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_01-model_states.pt... +[default1]:[2022-10-07 22:51:32,636] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_01-model_states.pt. +[default0]:[2022-10-07 22:51:32,560] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default0]:[2022-10-07 22:51:32,618] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_00_optim_states.pt'] +[default0]:[2022-10-07 22:51:32,619] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt'] +[default4]:[2022-10-07 22:51:32,582] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default4]:[2022-10-07 22:51:32,592] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default4]:[2022-10-07 22:51:32,617] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default4]:[2022-10-07 22:51:32,618] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default4]:[2022-10-07 22:51:32,644] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default4]:[2022-10-07 22:51:32,655] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default7]:[2022-10-07 22:51:32,570] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_01-model_states.pt. +[default7]:[2022-10-07 22:51:32,583] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default7]:[2022-10-07 22:51:32,611] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default7]:[2022-10-07 22:51:32,612] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_01-model_states.pt... +[default7]:[2022-10-07 22:51:32,651] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_01-model_states.pt. +[default7]:[2022-10-07 22:51:32,664] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default6]:[2022-10-07 22:51:32,571] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default6]:[2022-10-07 22:51:32,571] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default6]:[2022-10-07 22:51:32,595] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default6]:[2022-10-07 22:51:32,601] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default6]:[2022-10-07 22:51:32,651] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default6]:[2022-10-07 22:51:32,652] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default5]:[2022-10-07 22:51:32,575] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default5]:[2022-10-07 22:51:32,601] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default5]:[2022-10-07 22:51:32,601] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_01-model_states.pt... +[default5]:[2022-10-07 22:51:32,641] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_01-model_states.pt. +[default5]:[2022-10-07 22:51:32,651] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default1]:[2022-10-07 22:51:32,669] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_03_model_states.pt. +[default1]:[2022-10-07 22:51:32,669] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default1]:[2022-10-07 22:51:32,688] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default2]:[2022-10-07 22:51:32,646] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default2]:[2022-10-07 22:51:32,652] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default2]:[2022-10-07 22:51:32,675] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default1]:[2022-10-07 22:51:32,688] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_01-model_states.pt... +[default1]:[2022-10-07 22:51:32,711] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_01-model_states.pt. +[default1]:[2022-10-07 22:51:32,716] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default2]:[2022-10-07 22:51:32,675] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default2]:[2022-10-07 22:51:32,696] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default2]:[2022-10-07 22:51:32,702] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default1]:[2022-10-07 22:51:32,734] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default1]:[2022-10-07 22:51:32,735] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_01-model_states.pt... +[default2]:[2022-10-07 22:51:32,721] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default2]:[2022-10-07 22:51:32,721] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default3]:[2022-10-07 22:51:32,658] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default3]:[2022-10-07 22:51:32,659] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_01-model_states.pt... +[default3]:[2022-10-07 22:51:32,676] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_01-model_states.pt. +[default3]:[2022-10-07 22:51:32,681] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default3]:[2022-10-07 22:51:32,699] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default3]:[2022-10-07 22:51:32,699] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_01-model_states.pt... +[default3]:[2022-10-07 22:51:32,716] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_01-model_states.pt. +[default3]:[2022-10-07 22:51:32,720] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default1]:[2022-10-07 22:51:32,639] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default1]:[2022-10-07 22:51:32,656] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default1]:[2022-10-07 22:51:32,656] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_01-model_states.pt... +[default1]:[2022-10-07 22:51:32,675] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_01-model_states.pt. +[default1]:[2022-10-07 22:51:32,680] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default1]:[2022-10-07 22:51:32,680] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default1]:[2022-10-07 22:51:32,680] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_01-model_states.pt... +[default1]:[2022-10-07 22:51:32,680] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_01-model_states.pt. +[default2]:[2022-10-07 22:51:32,645] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default2]:[2022-10-07 22:51:32,645] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default2]:[2022-10-07 22:51:32,662] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default2]:[2022-10-07 22:51:32,668] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default2]:[2022-10-07 22:51:32,696] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default2]:[2022-10-07 22:51:32,696] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default2]:[2022-10-07 22:51:32,711] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default2]:[2022-10-07 22:51:32,718] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default0]:[2022-10-07 22:51:32,642] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default0]:[2022-10-07 22:51:32,661] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default0]:[2022-10-07 22:51:32,661] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default0]:[2022-10-07 22:51:32,679] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default0]:[2022-10-07 22:51:32,683] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default0]:[2022-10-07 22:51:32,700] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default0]:[2022-10-07 22:51:32,701] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default0]:[2022-10-07 22:51:32,718] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default0]:[2022-10-07 22:51:32,723] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default1]:[2022-10-07 22:51:32,742] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default4]:[2022-10-07 22:51:32,728] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default4]:[2022-10-07 22:51:32,729] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default2]:[2022-10-07 22:51:32,662] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default2]:[2022-10-07 22:51:32,669] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default2]:[2022-10-07 22:51:32,692] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default2]:[2022-10-07 22:51:32,692] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default2]:[2022-10-07 22:51:32,718] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default2]:[2022-10-07 22:51:32,725] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default2]:[2022-10-07 22:51:32,743] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default2]:[2022-10-07 22:51:32,743] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default1]:[2022-10-07 22:51:32,741] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_01_optim_states.pt'] +[default1]:[2022-10-07 22:51:32,742] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt'] +[default3]:[2022-10-07 22:51:32,671] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default3]:[2022-10-07 22:51:32,700] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default3]:[2022-10-07 22:51:32,700] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_01-model_states.pt... +[default3]:[2022-10-07 22:51:32,731] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_01-model_states.pt. +[default3]:[2022-10-07 22:51:32,752] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default4]:[2022-10-07 22:51:32,681] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default4]:[2022-10-07 22:51:32,681] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default4]:[2022-10-07 22:51:32,709] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default4]:[2022-10-07 22:51:32,718] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default4]:[2022-10-07 22:51:32,743] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default4]:[2022-10-07 22:51:32,743] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default2]:[2022-10-07 22:51:32,668] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default2]:[2022-10-07 22:51:32,735] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default7]:[2022-10-07 22:51:32,692] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default7]:[2022-10-07 22:51:32,693] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_01-model_states.pt... +[default7]:[2022-10-07 22:51:32,722] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_01-model_states.pt. +[default7]:[2022-10-07 22:51:32,733] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default6]:[2022-10-07 22:51:32,681] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default6]:[2022-10-07 22:51:32,694] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default6]:[2022-10-07 22:51:32,739] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default6]:[2022-10-07 22:51:32,739] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default6]:[2022-10-07 22:51:32,761] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default5]:[2022-10-07 22:51:32,682] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default5]:[2022-10-07 22:51:32,683] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_01-model_states.pt... +[default5]:[2022-10-07 22:51:32,723] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_01-model_states.pt. +[default5]:[2022-10-07 22:51:32,732] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default5]:[2022-10-07 22:51:32,754] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default5]:[2022-10-07 22:51:32,754] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_01-model_states.pt... +[default1]:[2022-10-07 22:51:32,752] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_01-model_states.pt. +[default1]:[2022-10-07 22:51:32,757] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default1]:[2022-10-07 22:51:32,774] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default2]:[2022-10-07 22:51:32,740] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default2]:[2022-10-07 22:51:32,745] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default2]:[2022-10-07 22:51:32,746] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default1]:[2022-10-07 22:51:32,774] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_01-model_states.pt... +[default1]:[2022-10-07 22:51:32,797] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_01-model_states.pt. +[default1]:[2022-10-07 22:51:32,803] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default1]:[2022-10-07 22:51:32,826] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default1]:[2022-10-07 22:51:32,826] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_01-model_states.pt... +[default2]:[2022-10-07 22:51:32,746] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default2]:[2022-10-07 22:51:32,746] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default3]:[2022-10-07 22:51:32,746] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default3]:[2022-10-07 22:51:32,746] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_01-model_states.pt... +[default3]:[2022-10-07 22:51:32,761] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_01-model_states.pt. +[default3]:[2022-10-07 22:51:32,766] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default3]:[2022-10-07 22:51:32,783] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default3]:[2022-10-07 22:51:32,783] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_01-model_states.pt... +[default3]:[2022-10-07 22:51:32,801] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_01-model_states.pt. +[default3]:[2022-10-07 22:51:32,805] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default3]:[2022-10-07 22:51:32,806] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default3]:[2022-10-07 22:51:32,806] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_01-model_states.pt... +[default3]:[2022-10-07 22:51:32,806] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_01-model_states.pt. +[default3]:[2022-10-07 22:51:32,785] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_03_model_states.pt. +[default3]:[2022-10-07 22:51:32,785] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default3]:[2022-10-07 22:51:32,805] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default3]:[2022-10-07 22:51:32,806] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_01-model_states.pt... +[default3]:[2022-10-07 22:51:32,826] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_01-model_states.pt. +[default3]:[2022-10-07 22:51:32,832] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default1]:[2022-10-07 22:51:32,742] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_03_optim_states.pt'] +[default1]:[2022-10-07 22:51:32,742] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_03_optim_states.pt'] +[default2]:[2022-10-07 22:51:32,741] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default2]:[2022-10-07 22:51:32,741] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default2]:[2022-10-07 22:51:32,758] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default2]:[2022-10-07 22:51:32,765] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default2]:[2022-10-07 22:51:32,788] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default2]:[2022-10-07 22:51:32,789] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default2]:[2022-10-07 22:51:32,807] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default2]:[2022-10-07 22:51:32,812] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default2]:[2022-10-07 22:51:32,827] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default2]:[2022-10-07 22:51:32,828] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default0]:[2022-10-07 22:51:32,742] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default0]:[2022-10-07 22:51:32,742] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default0]:[2022-10-07 22:51:32,760] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default0]:[2022-10-07 22:51:32,766] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default0]:[2022-10-07 22:51:32,786] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default0]:[2022-10-07 22:51:32,786] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default0]:[2022-10-07 22:51:32,803] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default0]:[2022-10-07 22:51:32,807] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default0]:[2022-10-07 22:51:32,824] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default0]:[2022-10-07 22:51:32,825] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default1]:[2022-10-07 22:51:32,780] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default1]:[2022-10-07 22:51:32,781] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_01-model_states.pt... +[default1]:[2022-10-07 22:51:32,816] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_01-model_states.pt. +[default1]:[2022-10-07 22:51:32,834] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default2]:[2022-10-07 22:51:32,759] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default2]:[2022-10-07 22:51:32,767] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default2]:[2022-10-07 22:51:32,794] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default2]:[2022-10-07 22:51:32,794] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default2]:[2022-10-07 22:51:32,813] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default2]:[2022-10-07 22:51:32,822] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default2]:[2022-10-07 22:51:32,842] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default2]:[2022-10-07 22:51:32,843] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default3]:[2022-10-07 22:51:32,850] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_01_optim_states.pt'] +[default3]:[2022-10-07 22:51:32,850] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt'] +[default3]:[2022-10-07 22:51:32,796] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default3]:[2022-10-07 22:51:32,796] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_01-model_states.pt... +[default3]:[2022-10-07 22:51:32,830] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_01-model_states.pt. +[default3]:[2022-10-07 22:51:32,845] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default4]:[2022-10-07 22:51:32,769] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default4]:[2022-10-07 22:51:32,780] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default4]:[2022-10-07 22:51:32,814] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default4]:[2022-10-07 22:51:32,814] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default4]:[2022-10-07 22:51:32,842] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default4]:[2022-10-07 22:51:32,852] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default2]:[2022-10-07 22:51:32,792] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default2]:[2022-10-07 22:51:32,792] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default2]:[2022-10-07 22:51:32,835] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default2]:[2022-10-07 22:51:32,850] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default7]:[2022-10-07 22:51:32,768] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default7]:[2022-10-07 22:51:32,768] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_01-model_states.pt... +[default7]:[2022-10-07 22:51:32,803] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_01-model_states.pt. +[default7]:[2022-10-07 22:51:32,814] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default7]:[2022-10-07 22:51:32,838] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default7]:[2022-10-07 22:51:32,839] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_01-model_states.pt... +[default6]:[2022-10-07 22:51:32,766] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default6]:[2022-10-07 22:51:32,811] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default6]:[2022-10-07 22:51:32,812] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default6]:[2022-10-07 22:51:32,830] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default6]:[2022-10-07 22:51:32,836] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default5]:[2022-10-07 22:51:32,789] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_01-model_states.pt. +[default5]:[2022-10-07 22:51:32,797] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default5]:[2022-10-07 22:51:32,820] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default5]:[2022-10-07 22:51:32,821] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_01-model_states.pt... +[default5]:[2022-10-07 22:51:32,861] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_01-model_states.pt. +[default5]:[2022-10-07 22:51:32,869] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default1]:[2022-10-07 22:51:32,852] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_01-model_states.pt. +[default1]:[2022-10-07 22:51:32,856] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default1]:[2022-10-07 22:51:32,874] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default1]:[2022-10-07 22:51:32,874] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_01-model_states.pt... +[default1]:[2022-10-07 22:51:32,889] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_01-model_states.pt. +[default1]:[2022-10-07 22:51:32,894] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default1]:[2022-10-07 22:51:32,914] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default1]:[2022-10-07 22:51:32,914] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_01-model_states.pt... +[default1]:[2022-10-07 22:51:32,934] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_01-model_states.pt. +[default3]:[2022-10-07 22:51:32,850] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_03_optim_states.pt'] +[default3]:[2022-10-07 22:51:32,851] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_03_optim_states.pt'] +[default3]:[2022-10-07 22:51:32,855] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default3]:[2022-10-07 22:51:32,855] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_01-model_states.pt... +[default3]:[2022-10-07 22:51:32,875] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_01-model_states.pt. +[default3]:[2022-10-07 22:51:32,880] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default3]:[2022-10-07 22:51:32,901] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default3]:[2022-10-07 22:51:32,901] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_01-model_states.pt... +[default3]:[2022-10-07 22:51:32,922] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_01-model_states.pt. +[default3]:[2022-10-07 22:51:32,928] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default2]:[2022-10-07 22:51:32,843] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default2]:[2022-10-07 22:51:32,850] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default2]:[2022-10-07 22:51:32,850] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default2]:[2022-10-07 22:51:32,850] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default2]:[2022-10-07 22:51:32,850] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default0]:[2022-10-07 22:51:32,844] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default0]:[2022-10-07 22:51:32,850] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default0]:[2022-10-07 22:51:32,867] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default0]:[2022-10-07 22:51:32,867] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default0]:[2022-10-07 22:51:32,885] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default0]:[2022-10-07 22:51:32,889] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default0]:[2022-10-07 22:51:32,911] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default0]:[2022-10-07 22:51:32,911] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default0]:[2022-10-07 22:51:32,935] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default0]:[2022-10-07 22:51:32,940] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default0]:[2022-10-07 22:51:32,885] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default1]:[2022-10-07 22:51:32,868] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default1]:[2022-10-07 22:51:32,869] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_01-model_states.pt... +[default1]:[2022-10-07 22:51:32,901] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_01-model_states.pt. +[default1]:[2022-10-07 22:51:32,919] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default4]:[2022-10-07 22:51:32,878] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default2]:[2022-10-07 22:51:32,862] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default2]:[2022-10-07 22:51:32,870] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default2]:[2022-10-07 22:51:32,889] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default2]:[2022-10-07 22:51:32,890] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default2]:[2022-10-07 22:51:32,910] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default2]:[2022-10-07 22:51:32,919] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default2]:[2022-10-07 22:51:32,948] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default2]:[2022-10-07 22:51:32,948] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default3]:[2022-10-07 22:51:32,883] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default3]:[2022-10-07 22:51:32,883] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_01-model_states.pt... +[default3]:[2022-10-07 22:51:32,910] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_01-model_states.pt. +[default3]:[2022-10-07 22:51:32,926] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default3]:[2022-10-07 22:51:32,955] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default3]:[2022-10-07 22:51:32,955] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_01-model_states.pt... +[default4]:[2022-10-07 22:51:32,880] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default4]:[2022-10-07 22:51:32,880] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default4]:[2022-10-07 22:51:32,904] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default4]:[2022-10-07 22:51:32,914] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default4]:[2022-10-07 22:51:32,947] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default4]:[2022-10-07 22:51:32,948] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default2]:[2022-10-07 22:51:32,896] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default2]:[2022-10-07 22:51:32,897] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default2]:[2022-10-07 22:51:32,931] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default2]:[2022-10-07 22:51:32,938] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default7]:[2022-10-07 22:51:32,869] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_01-model_states.pt. +[default7]:[2022-10-07 22:51:32,878] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default7]:[2022-10-07 22:51:32,908] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default7]:[2022-10-07 22:51:32,908] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_01-model_states.pt... +[default7]:[2022-10-07 22:51:32,939] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_01-model_states.pt. +[default7]:[2022-10-07 22:51:32,948] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default6]:[2022-10-07 22:51:32,878] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default6]:[2022-10-07 22:51:32,878] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default6]:[2022-10-07 22:51:32,901] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default6]:[2022-10-07 22:51:32,908] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default6]:[2022-10-07 22:51:32,947] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default6]:[2022-10-07 22:51:32,948] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default5]:[2022-10-07 22:51:32,891] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default5]:[2022-10-07 22:51:32,891] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_01-model_states.pt... +[default5]:[2022-10-07 22:51:32,926] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_01-model_states.pt. +[default5]:[2022-10-07 22:51:32,934] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default5]:[2022-10-07 22:51:32,958] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default5]:[2022-10-07 22:51:32,959] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_01-model_states.pt... +[default1]:[2022-10-07 22:51:32,939] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default1]:[2022-10-07 22:51:32,956] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default1]:[2022-10-07 22:51:32,956] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_01-model_states.pt... +[default2]:[2022-10-07 22:51:33,028] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/globa[default1]:[2022-10-07 22:51:32,975] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_01-model_states.pt. +[default1]:[2022-10-07 22:51:32,979] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default1]:[2022-10-07 22:51:32,995] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +l_step0/zero_pp_rank_5_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoint[default1]:[2022-10-07 22:51:32,995] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_01-model_states.pt... +s/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_02_optim_states.pt'] +[default1]:[2022-10-07 22:51:33,010] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_01-model_states.pt. +[default1]:[2022-10-07 22:51:33,014] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default1]:[2022-10-07 22:51:33,028] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default2]:[2022-10-07 22:51:33,028] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmi[default1]:[2022-10-07 22:51:33,028] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_01-model_states.pt... +xnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_02_optim_states.pt'] +[default3]:[2022-10-07 22:51:32,946] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default3]:[2022-10-07 22:51:32,947] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_01-model_states.pt... +[default3]:[2022-10-07 22:51:32,963] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_01-model_states.pt. +[default3]:[2022-10-07 22:51:32,967] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default3]:[2022-10-07 22:51:32,982] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default3]:[2022-10-07 22:51:32,983] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_01-model_states.pt... +[default3]:[2022-10-07 22:51:32,999] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_01-model_states.pt. +[default3]:[2022-10-07 22:51:33,002] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default3]:[2022-10-07 22:51:33,018] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default3]:[2022-10-07 22:51:33,018] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_01-model_states.pt... +[default3]:[2022-10-07 22:51:33,034] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_01-model_states.pt. +[default0]:[2022-10-07 22:51:32,940] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default0]:[2022-10-07 22:51:32,940] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default0]:[2022-10-07 22:51:32,940] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default0]:[2022-10-07 22:51:32,962] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default0]:[2022-10-07 22:51:33,026] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default0]:[2022-10-07 22:51:33,026] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default1]:[2022-10-07 22:51:32,954] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default1]:[2022-10-07 22:51:32,955] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_01-model_states.pt... +[default1]:[2022-10-07 22:51:32,986] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_01-model_states.pt. +[default1]:[2022-10-07 22:51:33,006] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default1]:[2022-10-07 22:51:33,038] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default1]:[2022-10-07 22:51:33,039] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_01-model_states.pt... +[default5]:[2022-10-07 22:51:32,963] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_01_model_states.pt. +[default5]:[2022-10-07 22:51:32,964] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default4]:[2022-10-07 22:51:32,956] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default4]:[2022-10-07 22:51:32,991] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default4]:[2022-10-07 22:51:32,991] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default4]:[2022-10-07 22:51:33,027] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default4]:[2022-10-07 22:51:33,038] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default2]:[2022-10-07 22:51:32,966] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default2]:[2022-10-07 22:51:33,028] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_00_optim_states.pt'] +[default2]:[2022-10-07 22:51:33,028] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt'] +[default4]:[2022-10-07 22:51:32,972] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default4]:[2022-10-07 22:51:32,986] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default4]:[2022-10-07 22:51:33,016] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default4]:[2022-10-07 22:51:33,016] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default4]:[2022-10-07 22:51:33,040] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default4]:[2022-10-07 22:51:33,048] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default3]:[2022-10-07 22:51:32,989] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_01-model_states.pt. +[default3]:[2022-10-07 22:51:33,007] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default3]:[2022-10-07 22:51:33,042] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default3]:[2022-10-07 22:51:33,042] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_01-model_states.pt... +[default2]:[2022-10-07 22:51:32,978] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default2]:[2022-10-07 22:51:32,979] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default2]:[2022-10-07 22:51:33,026] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default2]:[2022-10-07 22:51:33,034] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default7]:[2022-10-07 22:51:32,968] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default7]:[2022-10-07 22:51:32,969] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_01-model_states.pt... +[default7]:[2022-10-07 22:51:33,003] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_01-model_states.pt. +[default7]:[2022-10-07 22:51:33,011] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default7]:[2022-10-07 22:51:33,043] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default7]:[2022-10-07 22:51:33,043] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_01-model_states.pt... +[default6]:[2022-10-07 22:51:32,966] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default6]:[2022-10-07 22:51:32,973] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default6]:[2022-10-07 22:51:33,017] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default6]:[2022-10-07 22:51:33,017] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default6]:[2022-10-07 22:51:33,038] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default6]:[2022-10-07 22:51:33,044] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default5]:[2022-10-07 22:51:32,993] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_01-model_states.pt. +[default5]:[2022-10-07 22:51:33,000] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default5]:[2022-10-07 22:51:33,025] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default5]:[2022-10-07 22:51:33,025] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_01-model_states.pt... +[default5]:[2022-10-07 22:51:33,057] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_01-model_states.pt. +[default5]:[2022-10-07 22:51:33,066] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default1]:[2022-10-07 22:51:33,045] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_01-model_states.pt. +[default1]:[2022-10-07 22:51:33,049] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default1]:[2022-10-07 22:51:33,064] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default0]:[2022-10-07 22:51:33,070] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default0]:[2022-10-07 22:51:33,086] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default0]:[2022-10-07 22:51:33,143] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default0]:[2022-10-07 22:51:33,143] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default1]:[2022-10-07 22:51:33,069] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_01-model_states.pt. +[default1]:[2022-10-07 22:51:33,084] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default1]:[2022-10-07 22:51:33,121] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default1]:[2022-10-07 22:51:33,121] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_01-model_states.pt... +[default5]:[2022-10-07 22:51:33,116] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default5]:[2022-10-07 22:51:33,117] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_01-model_states.pt... +[default6]:[2022-10-07 22:51:33,146] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default6]:[2022-10-07 22:51:33,146] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default4]:[2022-10-07 22:51:33,071] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default4]:[2022-10-07 22:51:33,071] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default4]:[2022-10-07 22:51:33,101] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default4]:[2022-10-07 22:51:33,110] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default4]:[2022-10-07 22:51:33,139] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default4]:[2022-10-07 22:51:33,139] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default4]:[2022-10-07 22:51:33,090] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default4]:[2022-10-07 22:51:33,090] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default4]:[2022-10-07 22:51:33,113] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default4]:[2022-10-07 22:51:33,123] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default4]:[2022-10-07 22:51:33,150] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default4]:[2022-10-07 22:51:33,150] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default3]:[2022-10-07 22:51:33,076] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_01-model_states.pt. +[default3]:[2022-10-07 22:51:33,091] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default3]:[2022-10-07 22:51:33,126] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default3]:[2022-10-07 22:51:33,126] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_01-model_states.pt... +[default7]:[2022-10-07 22:51:33,076] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_01-model_states.pt. +[default7]:[2022-10-07 22:51:33,086] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default7]:[2022-10-07 22:51:33,121] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default7]:[2022-10-07 22:51:33,121] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_01-model_states.pt... +[default7]:[2022-10-07 22:51:33,155] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_01-model_states.pt. +[default7]:[2022-10-07 22:51:33,164] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default2]:[2022-10-07 22:51:33,076] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default2]:[2022-10-07 22:51:33,076] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default2]:[2022-10-07 22:51:33,116] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default2]:[2022-10-07 22:51:33,125] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default6]:[2022-10-07 22:51:33,084] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default6]:[2022-10-07 22:51:33,085] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default6]:[2022-10-07 22:51:33,104] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default6]:[2022-10-07 22:51:33,111] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default6]:[2022-10-07 22:51:33,146] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default6]:[2022-10-07 22:51:33,146] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default6]:[2022-10-07 22:51:33,165] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default5]:[2022-10-07 22:51:33,090] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default5]:[2022-10-07 22:51:33,090] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_01-model_states.pt... +[default5]:[2022-10-07 22:51:33,132] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_01-model_states.pt. +[default5]:[2022-10-07 22:51:33,141] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default5]:[2022-10-07 22:51:33,162] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default5]:[2022-10-07 22:51:33,162] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_01-model_states.pt... +[default1]:[2022-10-07 22:51:33,064] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_01-model_states.pt... +[default1]:[2022-10-07 22:51:33,080] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_01-model_states.pt. +[default1]:[2022-10-07 22:51:33,084] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default1]:[2022-10-07 22:51:33,098] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default1]:[2022-10-07 22:51:33,098] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_01-model_states.pt... +[default1]:[2022-10-07 22:51:33,114] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_01-model_states.pt. +[default1]:[2022-10-07 22:51:33,118] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default1]:[2022-10-07 22:51:33,137] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default1]:[2022-10-07 22:51:33,137] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_01-model_states.pt... +[default3]:[2022-10-07 22:51:33,039] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default3]:[2022-10-07 22:51:33,054] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default3]:[2022-10-07 22:51:33,054] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_01-model_states.pt... +[default3]:[2022-10-07 22:51:33,069] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_01-model_states.pt. +[default3]:[2022-10-07 22:51:33,074] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default3]:[2022-10-07 22:51:33,089] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default3]:[2022-10-07 22:51:33,090] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_01-model_states.pt... +[default3]:[2022-10-07 22:51:33,106] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_01-model_states.pt. +[default3]:[2022-10-07 22:51:33,111] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default3]:[2022-10-07 22:51:33,130] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default3]:[2022-10-07 22:51:33,130] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_01-model_states.pt... +[default1]:[2022-10-07 22:51:33,157] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_01-model_states.pt. +[default1]:[2022-10-07 22:51:33,161] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default1]:[2022-10-07 22:51:33,161] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default1]:[2022-10-07 22:51:33,161] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_01-model_states.pt... +[default1]:[2022-10-07 22:51:33,162] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_01-model_states.pt. +[default3]:[2022-10-07 22:51:33,147] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_01-model_states.pt. +[default3]:[2022-10-07 22:51:33,152] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default3]:[2022-10-07 22:51:33,169] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default3]:[2022-10-07 22:51:33,169] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_01-model_states.pt... +[default3]:[2022-10-07 22:51:33,184] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_01-model_states.pt. +[default3]:[2022-10-07 22:51:33,188] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default3]:[2022-10-07 22:51:33,203] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default3]:[2022-10-07 22:51:33,203] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_01-model_states.pt... +[default3]:[2022-10-07 22:51:33,218] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_01-model_states.pt. +[default3]:[2022-10-07 22:51:33,221] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default0]:[2022-10-07 22:51:33,195] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default0]:[2022-10-07 22:51:33,203] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default0]:[2022-10-07 22:51:33,247] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default1]:[2022-10-07 22:51:33,158] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_01-model_states.pt. +[default1]:[2022-10-07 22:51:33,176] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default1]:[2022-10-07 22:51:33,206] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default0]:[2022-10-07 22:51:33,248] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default1]:[2022-10-07 22:51:33,207] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_01-model_states.pt... +[default1]:[2022-10-07 22:51:33,238] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_01-model_states.pt. +[default4]:[2022-10-07 22:51:33,167] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default4]:[2022-10-07 22:51:33,175] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default4]:[2022-10-07 22:51:33,207] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default4]:[2022-10-07 22:51:33,207] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default4]:[2022-10-07 22:51:33,240] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default4]:[2022-10-07 22:51:33,250] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default4]:[2022-10-07 22:51:33,172] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default4]:[2022-10-07 22:51:33,181] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default4]:[2022-10-07 22:51:33,221] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default4]:[2022-10-07 22:51:33,221] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default4]:[2022-10-07 22:51:33,242] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default4]:[2022-10-07 22:51:33,251] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default3]:[2022-10-07 22:51:33,165] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_01-model_states.pt. +[default3]:[2022-10-07 22:51:33,186] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default3]:[2022-10-07 22:51:33,217] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default3]:[2022-10-07 22:51:33,217] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_01-model_states.pt... +[default3]:[2022-10-07 22:51:33,249] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_01-model_states.pt. +[default2]:[2022-10-07 22:51:33,175] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default2]:[2022-10-07 22:51:33,175] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default2]:[2022-10-07 22:51:33,220] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default2]:[2022-10-07 22:51:33,228] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default7]:[2022-10-07 22:51:33,197] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default7]:[2022-10-07 22:51:33,197] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_01-model_states.pt... +[default7]:[2022-10-07 22:51:33,220] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_01-model_states.pt. +[default7]:[2022-10-07 22:51:33,227] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default7]:[2022-10-07 22:51:33,259] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default7]:[2022-10-07 22:51:33,259] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_01-model_states.pt... +[default6]:[2022-10-07 22:51:33,169] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default6]:[2022-10-07 22:51:33,227] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default6]:[2022-10-07 22:51:33,227] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default6]:[2022-10-07 22:51:33,246] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default6]:[2022-10-07 22:51:33,252] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default5]:[2022-10-07 22:51:33,205] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_01-model_states.pt. +[default5]:[2022-10-07 22:51:33,211] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default5]:[2022-10-07 22:51:33,233] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default5]:[2022-10-07 22:51:33,233] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_01-model_states.pt... +[default3]:[2022-10-07 22:51:33,240] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default3]:[2022-10-07 22:51:33,240] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_01-model_states.pt... +[default3]:[2022-10-07 22:51:33,258] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_01-model_states.pt. +[default3]:[2022-10-07 22:51:33,261] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default3]:[2022-10-07 22:51:33,262] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default3]:[2022-10-07 22:51:33,262] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_01-model_states.pt... +[default3]:[2022-10-07 22:51:33,262] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_01-model_states.pt. +[default1]:[2022-10-07 22:51:33,261] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default1]:[2022-10-07 22:51:33,294] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default1]:[2022-10-07 22:51:33,294] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_01-model_states.pt... +[default1]:[2022-10-07 22:51:33,324] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_01-model_states.pt. +[default1]:[2022-10-07 22:51:33,343] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default0]:[2022-10-07 22:51:33,306] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default0]:[2022-10-07 22:51:33,314] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default5]:[2022-10-07 22:51:33,263] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_01-model_states.pt. +[default5]:[2022-10-07 22:51:33,305] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default5]:[2022-10-07 22:51:33,330] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default5]:[2022-10-07 22:51:33,331] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_01-model_states.pt... +[default7]:[2022-10-07 22:51:33,300] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_01_model_states.pt. +[default7]:[2022-10-07 22:51:33,301] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default6]:[2022-10-07 22:51:33,274] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default6]:[2022-10-07 22:51:33,275] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default4]:[2022-10-07 22:51:33,282] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default4]:[2022-10-07 22:51:33,283] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default4]:[2022-10-07 22:51:33,308] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default4]:[2022-10-07 22:51:33,316] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default4]:[2022-10-07 22:51:33,341] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default4]:[2022-10-07 22:51:33,341] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default4]:[2022-10-07 22:51:33,297] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default4]:[2022-10-07 22:51:33,297] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default4]:[2022-10-07 22:51:33,319] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default3]:[2022-10-07 22:51:33,272] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default3]:[2022-10-07 22:51:33,307] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default3]:[2022-10-07 22:51:33,307] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_01-model_states.pt... +[default3]:[2022-10-07 22:51:33,334] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_01-model_states.pt. +[default3]:[2022-10-07 22:51:33,354] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default2]:[2022-10-07 22:51:33,274] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default2]:[2022-10-07 22:51:33,275] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default2]:[2022-10-07 22:51:33,320] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default7]:[2022-10-07 22:51:33,291] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_01-model_states.pt. +[default7]:[2022-10-07 22:51:33,301] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default7]:[2022-10-07 22:51:33,324] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default7]:[2022-10-07 22:51:33,324] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_01-model_states.pt... +[default2]:[2022-10-07 22:51:33,338] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default2]:[2022-10-07 22:51:33,364] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default2]:[2022-10-07 22:51:33,364] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default7]:[2022-10-07 22:51:33,350] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_01-model_states.pt. +[default6]:[2022-10-07 22:51:33,295] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default6]:[2022-10-07 22:51:33,296] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default6]:[2022-10-07 22:51:33,316] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default5]:[2022-10-07 22:51:33,274] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_01-model_states.pt. +[default5]:[2022-10-07 22:51:33,281] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default5]:[2022-10-07 22:51:33,306] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default5]:[2022-10-07 22:51:33,307] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_01-model_states.pt... +[default5]:[2022-10-07 22:51:33,343] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_01-model_states.pt. +[default4]:[2022-10-07 22:51:33,401] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_02_optim_states.pt'] +[default4]:[2022-10-07 22:51:33,401] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_02_optim_states.pt'] +[default7]:[2022-10-07 22:51:33,428] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_03_optim_states.pt'] +[default7]:[2022-10-07 22:51:33,428] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_03_optim_states.pt'] +[default5]:[2022-10-07 22:51:33,432] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoint[default1]:[2022-10-07 22:51:33,414] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default1]:[2022-10-07 22:51:33,415] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_01-model_states.pt... +[default1]:[2022-10-07 22:51:33,437] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_01-model_states.pt. +[default1]:[2022-10-07 22:51:33,443] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default0]:[2022-10-07 22:51:33,353] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default0]:[2022-10-07 22:51:33,353] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default0]:[2022-10-07 22:51:33,382] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default0]:[2022-10-07 22:51:33,395] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default0]:[2022-10-07 22:51:33,419] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default0]:[2022-10-07 22:51:33,419] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default0]:[2022-10-07 22:51:33,436] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default0]:[2022-10-07 22:51:33,443] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default5]:[2022-10-07 22:51:33,352] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_01-model_states.pt. +[default5]:[2022-10-07 22:51:33,359] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default5]:[2022-10-07 22:51:33,384] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default5]:[2022-10-07 22:51:33,385] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_01-model_states.pt... +[default5]:[2022-10-07 22:51:33,412] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_01-model_states.pt. +[default5]:[2022-10-07 22:51:33,419] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default5]:[2022-10-07 22:51:33,449] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default5]:[2022-10-07 22:51:33,449] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_01-model_states.pt... +[default7]:[2022-10-07 22:51:33,416] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default7]:[2022-10-07 22:51:33,417] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_01-model_states.pt... +[default6]:[2022-10-07 22:51:33,395] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default4]:[2022-10-07 22:51:33,366] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default4]:[2022-10-07 22:51:33,375] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default4]:[2022-10-07 22:51:33,406] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default4]:[2022-10-07 22:51:33,406] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default4]:[2022-10-07 22:51:33,437] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default4]:[2022-10-07 22:51:33,448] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default4]:[2022-10-07 22:51:33,399] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_00_optim_states.pt'] +[default4]:[2022-10-07 22:51:33,400] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt'] +[default3]:[2022-10-07 22:51:33,380] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default3]:[2022-10-07 22:51:33,380] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_01-model_states.pt... +[default3]:[2022-10-07 22:51:33,402] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_01-model_states.pt. +[default3]:[2022-10-07 22:51:33,410] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default3]:[2022-10-07 22:51:33,431] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default3]:[2022-10-07 22:51:33,432] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_01-model_states.pt... +[default3]:[2022-10-07 22:51:33,450] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_01-model_states.pt. +[default3]:[2022-10-07 22:51:33,455] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default7]:[2022-10-07 22:51:33,427] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_01_optim_states.pt'] +[default2]:[2022-10-07 22:51:33,388] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default2]:[2022-10-07 22:51:33,397] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default2]:[2022-10-07 22:51:33,420] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default7]:[2022-10-07 22:51:33,428] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt'] +[default2]:[2022-10-07 22:51:33,420] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default2]:[2022-10-07 22:51:33,438] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default2]:[2022-10-07 22:51:33,444] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default2]:[2022-10-07 22:51:33,465] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default2]:[2022-10-07 22:51:33,465] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default6]:[2022-10-07 22:51:33,395] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_00_optim_states.pt'] +[default6]:[2022-10-07 22:51:33,396] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt'] +[default5]:[2022-10-07 22:51:33,432] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_01_optim_states.pt'] +[default5]:[2022-10-07 22:51:33,432] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt'] +s/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_03_optim_states.pt'] +[default5]:[2022-10-07 22:51:33,433] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_03_optim_states.pt'] +[default6]:[2022-10-07 22:51:33,396] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_02_optim_states.pt'] +[default6]:[2022-10-07 22:51:33,397] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_02_optim_states.pt'] +[default1]:[2022-10-07 22:51:33,463] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default1]:[2022-10-07 22:51:33,464] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_01-model_states.pt... +[default1]:[2022-10-07 22:51:33,478] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_01-model_states.pt. +[default1]:[2022-10-07 22:51:33,484] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default1]:[2022-10-07 22:51:33,503] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default1]:[2022-10-07 22:51:33,503] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_01-model_states.pt... +[default1]:[2022-10-07 22:51:33,519] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_01-model_states.pt. +[default0]:[2022-10-07 22:51:33,461] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default0]:[2022-10-07 22:51:33,461] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default0]:[2022-10-07 22:51:33,476] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default0]:[2022-10-07 22:51:33,481] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default0]:[2022-10-07 22:51:33,499] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default0]:[2022-10-07 22:51:33,499] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default0]:[2022-10-07 22:51:33,515] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default0]:[2022-10-07 22:51:33,519] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default1]:[2022-10-07 22:51:33,524] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default0]:[2022-10-07 22:51:33,537] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default0]:[2022-10-07 22:51:33,538] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default5]:[2022-10-07 22:51:33,474] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_01-model_states.pt. +[default5]:[2022-10-07 22:51:33,480] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default5]:[2022-10-07 22:51:33,504] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default5]:[2022-10-07 22:51:33,504] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_01-model_states.pt... +[default5]:[2022-10-07 22:51:33,533] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_01-model_states.pt. +[default5]:[2022-10-07 22:51:33,539] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default7]:[2022-10-07 22:51:33,549] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_01-model_states.pt. +[default6]:[2022-10-07 22:51:33,458] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default6]:[2022-10-07 22:51:33,482] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default6]:[2022-10-07 22:51:33,482] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default6]:[2022-10-07 22:51:33,503] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default6]:[2022-10-07 22:51:33,513] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default6]:[2022-10-07 22:51:33,536] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default6]:[2022-10-07 22:51:33,536] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default4]:[2022-10-07 22:51:33,479] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default4]:[2022-10-07 22:51:33,479] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default4]:[2022-10-07 22:51:33,507] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default4]:[2022-10-07 22:51:33,517] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default4]:[2022-10-07 22:51:33,547] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default4]:[2022-10-07 22:51:33,547] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default3]:[2022-10-07 22:51:33,472] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default3]:[2022-10-07 22:51:33,472] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_01-model_states.pt... +[default3]:[2022-10-07 22:51:33,491] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_01-model_states.pt. +[default3]:[2022-10-07 22:51:33,495] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default3]:[2022-10-07 22:51:33,511] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default3]:[2022-10-07 22:51:33,511] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_01-model_states.pt... +[default3]:[2022-10-07 22:51:33,529] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_01-model_states.pt. +[default2]:[2022-10-07 22:51:33,480] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default2]:[2022-10-07 22:51:33,485] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default2]:[2022-10-07 22:51:33,505] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default2]:[2022-10-07 22:51:33,506] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default2]:[2022-10-07 22:51:33,523] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default2]:[2022-10-07 22:51:33,527] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default2]:[2022-10-07 22:51:33,545] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default2]:[2022-10-07 22:51:33,545] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default2]:[2022-10-07 22:51:33,561] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default2]:[2022-10-07 22:51:33,565] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default3]:[2022-10-07 22:51:33,577] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_03_optim_states.pt'] +[default3]:[2022-10-07 22:51:33,578] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_03_optim_states.pt'] +[default1]:[2022-10-07 22:51:33,557] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default1]:[2022-10-07 22:51:33,558] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_01-model_states.pt... +[default1]:[2022-10-07 22:51:33,586] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_01-model_states.pt. +[default1]:[2022-10-07 22:51:33,591] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default0]:[2022-10-07 22:51:33,552] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default0]:[2022-10-07 22:51:33,558] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default0]:[2022-10-07 22:51:33,575] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default0]:[2022-10-07 22:51:33,575] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default0]:[2022-10-07 22:51:33,592] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default0]:[2022-10-07 22:51:33,596] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default0]:[2022-10-07 22:51:33,613] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default0]:[2022-10-07 22:51:33,613] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default0]:[2022-10-07 22:51:33,628] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default0]:[2022-10-07 22:51:33,634] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default1]:[2022-10-07 22:51:33,611] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default1]:[2022-10-07 22:51:33,611] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_01-model_states.pt... +[default1]:[2022-10-07 22:51:33,628] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_01-model_states.pt. +[default5]:[2022-10-07 22:51:33,561] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default5]:[2022-10-07 22:51:33,561] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_01-model_states.pt... +[default5]:[2022-10-07 22:51:33,589] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_01-model_states.pt. +[default5]:[2022-10-07 22:51:33,596] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default5]:[2022-10-07 22:51:33,618] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default5]:[2022-10-07 22:51:33,619] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_01-model_states.pt... +[default5]:[2022-10-07 22:51:33,648] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_01-model_states.pt. +[default7]:[2022-10-07 22:51:33,595] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default7]:[2022-10-07 22:51:33,620] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default7]:[2022-10-07 22:51:33,620] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_01-model_states.pt... +[default7]:[2022-10-07 22:51:33,641] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_01-model_states.pt. +[default7]:[2022-10-07 22:51:33,648] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default6]:[2022-10-07 22:51:33,558] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default6]:[2022-10-07 22:51:33,567] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default6]:[2022-10-07 22:51:33,592] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default6]:[2022-10-07 22:51:33,592] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default6]:[2022-10-07 22:51:33,612] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default4]:[2022-10-07 22:51:33,573] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default6]:[2022-10-07 22:51:33,619] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default4]:[2022-10-07 22:51:33,580] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default4]:[2022-10-07 22:51:33,609] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default4]:[2022-10-07 22:51:33,609] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default6]:[2022-10-07 22:51:33,641] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default6]:[2022-10-07 22:51:33,641] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default4]:[2022-10-07 22:51:33,635] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default4]:[2022-10-07 22:51:33,643] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default3]:[2022-10-07 22:51:33,577] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_01_optim_states.pt'] +[default3]:[2022-10-07 22:51:33,577] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt'] +[default2]:[2022-10-07 22:51:33,585] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default2]:[2022-10-07 22:51:33,585] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default2]:[2022-10-07 22:51:33,604] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default2]:[2022-10-07 22:51:33,655] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_00_optim_states.pt'] +[default2]:[2022-10-07 22:51:33,655] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt'] +[default1]:[2022-10-07 22:51:33,683] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_03_optim_states.pt'] +[default1]:[2022-10-07 22:51:33,683] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_03_optim_states.pt'] +[default2]:[2022-10-07 22:51:33,655] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_02_optim_states.pt'] +[default2]:[2022-10-07 22:51:33,656] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_02_optim_states.pt'] +[default0]:[2022-10-07 22:51:33,649] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default0]:[2022-10-07 22:51:33,649] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default0]:[2022-10-07 22:51:33,664] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default0]:[2022-10-07 22:51:33,668] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default0]:[2022-10-07 22:51:33,687] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default0]:[2022-10-07 22:51:33,688] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default0]:[2022-10-07 22:51:33,706] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default1]:[2022-10-07 22:51:33,683] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_01_optim_states.pt'] +[default1]:[2022-10-07 22:51:33,683] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt'] +[default5]:[2022-10-07 22:51:33,654] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default5]:[2022-10-07 22:51:33,684] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default5]:[2022-10-07 22:51:33,684] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_01-model_states.pt... +[default5]:[2022-10-07 22:51:33,711] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_01-model_states.pt. +[default5]:[2022-10-07 22:51:33,717] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default5]:[2022-10-07 22:51:33,738] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default5]:[2022-10-07 22:51:33,739] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_01-model_states.pt... +[default7]:[2022-10-07 22:51:33,672] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default7]:[2022-10-07 22:51:33,672] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_01-model_states.pt... +[default7]:[2022-10-07 22:51:33,701] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_01-model_states.pt. +[default7]:[2022-10-07 22:51:33,706] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default7]:[2022-10-07 22:51:33,730] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default7]:[2022-10-07 22:51:33,730] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_01-model_states.pt... +[default6]:[2022-10-07 22:51:33,664] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default6]:[2022-10-07 22:51:33,669] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default6]:[2022-10-07 22:51:33,691] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default6]:[2022-10-07 22:51:33,691] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default6]:[2022-10-07 22:51:33,712] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default6]:[2022-10-07 22:51:33,719] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default6]:[2022-10-07 22:51:33,740] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default6]:[2022-10-07 22:51:33,740] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default4]:[2022-10-07 22:51:33,670] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default4]:[2022-10-07 22:51:33,670] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default4]:[2022-10-07 22:51:33,695] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default4]:[2022-10-07 22:51:33,701] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default4]:[2022-10-07 22:51:33,734] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default4]:[2022-10-07 22:51:33,734] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default4]:[2022-10-07 22:51:33,817] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_02_optim_states.pt'] +[default4]:[2022-10-07 22:51:33,817] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_02_optim_states.pt'] +[default0]:[2022-10-07 22:51:33,753] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_02_optim_states.pt'] +[default0]:[2022-10-07 22:51:33,754] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_02_optim_states.pt'] +[default0]:[2022-10-07 22:51:33,753] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_00_optim_states.pt'] +[default0]:[2022-10-07 22:51:33,753] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt'] +[default0]:could not find arguments in the checkpoint ... +[default0]: checkpoint version 3.0 +[default5]:[2022-10-07 22:51:33,769] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_01-model_states.pt. +[default5]:[2022-10-07 22:51:33,774] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default5]:[2022-10-07 22:51:33,796] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default5]:[2022-10-07 22:51:33,796] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_01-model_states.pt... +[default5]:[2022-10-07 22:51:33,819] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_01-model_states.pt. +[default5]:[2022-10-07 22:51:33,823] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default5]:[2022-10-07 22:51:33,842] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default5]:[2022-10-07 22:51:33,842] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_01-model_states.pt... +[default7]:[2022-10-07 22:51:33,752] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_01-model_states.pt. +[default7]:[2022-10-07 22:51:33,758] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default7]:[2022-10-07 22:51:33,780] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default7]:[2022-10-07 22:51:33,781] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_01-model_states.pt... +[default7]:[2022-10-07 22:51:33,797] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_01-model_states.pt. +[default7]:[2022-10-07 22:51:33,803] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default7]:[2022-10-07 22:51:33,821] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default7]:[2022-10-07 22:51:33,821] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_01-model_states.pt... +[default7]:[2022-10-07 22:51:33,845] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_01-model_states.pt. +[default6]:[2022-10-07 22:51:33,762] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default6]:[2022-10-07 22:51:33,769] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default6]:[2022-10-07 22:51:33,787] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default6]:[2022-10-07 22:51:33,787] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default6]:[2022-10-07 22:51:33,804] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default6]:[2022-10-07 22:51:33,810] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default6]:[2022-10-07 22:51:33,828] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default6]:[2022-10-07 22:51:33,828] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default4]:[2022-10-07 22:51:33,758] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default4]:[2022-10-07 22:51:33,816] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_00_optim_states.pt'] +[default4]:[2022-10-07 22:51:33,816] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt'] +[default6]:[2022-10-07 22:51:33,845] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default5]:[2022-10-07 22:51:33,861] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_01-model_states.pt. +[default5]:[2022-10-07 22:51:33,866] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default5]:[2022-10-07 22:51:33,883] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default5]:[2022-10-07 22:51:33,883] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_01-model_states.pt... +[default5]:[2022-10-07 22:51:33,901] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_01-model_states.pt. +[default5]:[2022-10-07 22:51:33,907] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default5]:[2022-10-07 22:51:33,926] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default5]:[2022-10-07 22:51:33,926] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_01-model_states.pt... +[default5]:[2022-10-07 22:51:33,946] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_01-model_states.pt. +[default7]:[2022-10-07 22:51:33,853] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default7]:[2022-10-07 22:51:33,872] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default7]:[2022-10-07 22:51:33,872] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_01-model_states.pt... +[default7]:[2022-10-07 22:51:33,889] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_01-model_states.pt. +[default7]:[2022-10-07 22:51:33,896] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default7]:[2022-10-07 22:51:33,914] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default7]:[2022-10-07 22:51:33,915] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_01-model_states.pt... +[default7]:[2022-10-07 22:51:33,933] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_01-model_states.pt. +[default7]:[2022-10-07 22:51:33,939] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default6]:[2022-10-07 22:51:33,853] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default6]:[2022-10-07 22:51:33,875] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default6]:[2022-10-07 22:51:33,875] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default6]:[2022-10-07 22:51:33,893] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default6]:[2022-10-07 22:51:33,900] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default6]:[2022-10-07 22:51:33,919] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default6]:[2022-10-07 22:51:33,920] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default6]:[2022-10-07 22:51:33,938] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default6]:[2022-10-07 22:51:33,945] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default5]:[2022-10-07 22:51:33,995] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_03_optim_states.pt'] +[default5]:[2022-10-07 22:51:33,996] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_03_optim_states.pt'] +[default5]:[2022-10-07 22:51:33,994] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_01_optim_states.pt'] +[default5]:[2022-10-07 22:51:33,995] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt'] +[default7]:[2022-10-07 22:51:33,958] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default7]:[2022-10-07 22:51:33,958] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_01-model_states.pt... +[default7]:[2022-10-07 22:51:33,974] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_01-model_states.pt. +[default7]:[2022-10-07 22:51:33,980] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default7]:[2022-10-07 22:51:33,999] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default7]:[2022-10-07 22:51:33,999] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_01-model_states.pt... +[default7]:[2022-10-07 22:51:34,016] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_01-model_states.pt. +[default7]:[2022-10-07 22:51:34,021] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default7]:[2022-10-07 22:51:34,039] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default7]:[2022-10-07 22:51:34,039] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_01-model_states.pt... +[default6]:[2022-10-07 22:51:33,961] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default6]:[2022-10-07 22:51:33,962] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default6]:[2022-10-07 22:51:33,977] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default6]:[2022-10-07 22:51:33,983] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default6]:[2022-10-07 22:51:34,002] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default6]:[2022-10-07 22:51:34,002] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default6]:[2022-10-07 22:51:34,019] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default6]:[2022-10-07 22:51:34,068] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_02_optim_states.pt'] +[default6]:[2022-10-07 22:51:34,069] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_02_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_02_optim_states.pt'] +[default7]:[2022-10-07 22:51:34,054] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_01-model_states.pt. +[default7]:[2022-10-07 22:51:34,059] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default7]:[2022-10-07 22:51:34,074] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default7]:[2022-10-07 22:51:34,075] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_01-model_states.pt... +[default7]:[2022-10-07 22:51:34,089] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_01-model_states.pt. +[default7]:[2022-10-07 22:51:34,093] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default7]:[2022-10-07 22:51:34,109] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default7]:[2022-10-07 22:51:34,109] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_01-model_states.pt... +[default7]:[2022-10-07 22:51:34,126] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_01-model_states.pt. +[default6]:[2022-10-07 22:51:34,068] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_00_optim_states.pt'] +[default6]:[2022-10-07 22:51:34,068] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt'] +[default7]:[2022-10-07 22:51:34,174] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_03_optim_states.pt'] +[default7]:[2022-10-07 22:51:34,174] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_03_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_03_optim_states.pt'] +[default7]:time (ms) | load-checkpoint: 7144.17 +[default0]: successfully loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq at iteration 0 +[default7]:[2022-10-07 22:51:34,173] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_01_optim_states.pt'] +[default7]:[2022-10-07 22:51:34,173] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt'] +[default0]:/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/utils.py:365: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings +[default0]: warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") +[default0]:estimated model parameters: 2.236514304 +[default0]:estimated model parameters without embeddings: 1.208909824 +[default0]:[after model, optimizer, and learning rate scheduler are built] datetime: 2022-10-07 22:51:34 +[default0]:> building train, validation, and test datasets ... +[default0]: > datasets target sizes (minimum size): +[default0]: train: 6348800 +[default0]: validation: 102400 +[default0]: test: 4096 +[default0]:> building train, validation, and test datasets for T0 ... +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.135762 seconds +[default0]: number of documents: 31495184 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 29920425) total of 29920425 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.058564 seconds +[default0]: number of documents: 31495184 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.002882 seconds +[default0]: number of documents: 31495184 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_en_train_indexmap_2470317ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_en_train_indexmap_2470317ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.089 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.105791 seconds +[default0]: number of documents: 5151349 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 4893782) total of 4893782 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.002384 seconds +[default0]: number of documents: 5151349 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.002232 seconds +[default0]: number of documents: 5151349 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_es_train_indexmap_501992ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_es_train_indexmap_501992ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.046 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.027754 seconds +[default0]: number of documents: 3562772 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 3384633) total of 3384633 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.065956 seconds +[default0]: number of documents: 3562772 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.002764 seconds +[default0]: number of documents: 3562772 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_pt_train_indexmap_406376ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_pt_train_indexmap_406376ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.022 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.046576 seconds +[default0]: number of documents: 2707724 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 2572338) total of 2572338 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.125233 seconds +[default0]: number of documents: 2707724 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.002931 seconds +[default0]: number of documents: 2707724 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_code_train_indexmap_373254ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_code_train_indexmap_373254ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.015 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.045833 seconds +[default0]: number of documents: 5055942 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 4803145) total of 4803145 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.048433 seconds +[default0]: number of documents: 5055942 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.002426 seconds +[default0]: number of documents: 5055942 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_fr_train_indexmap_367732ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_fr_train_indexmap_367732ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.015 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.059427 seconds +[default0]: number of documents: 2148955 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 2041507) total of 2041507 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.056471 seconds +[default0]: number of documents: 2148955 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.002302 seconds +[default0]: number of documents: 2148955 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ar_train_indexmap_310091ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ar_train_indexmap_310091ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.014 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.082033 seconds +[default0]: number of documents: 2627392 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 2496022) total of 2496022 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.021374 seconds +[default0]: number of documents: 2627392 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003201 seconds +[default0]: number of documents: 2627392 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_id_train_indexmap_305386ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_id_train_indexmap_305386ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.019 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.041854 seconds +[default0]: number of documents: 3560556 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 3382528) total of 3382528 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.110075 seconds +[default0]: number of documents: 3560556 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.002605 seconds +[default0]: number of documents: 3560556 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_zh_train_indexmap_304299ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_zh_train_indexmap_304299ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.040 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.022101 seconds +[default0]: number of documents: 1543441 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 1466269) total of 1466269 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.045393 seconds +[default0]: number of documents: 1543441 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003256 seconds +[default0]: number of documents: 1543441 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_hi_train_indexmap_291291ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_hi_train_indexmap_291291ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.042 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.027901 seconds +[default0]: number of documents: 1667306 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 1583941) total of 1583941 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.044573 seconds +[default0]: number of documents: 1667306 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.004491 seconds +[default0]: number of documents: 1667306 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_vi_train_indexmap_205616ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_vi_train_indexmap_205616ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.014 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.061393 seconds +[default0]: number of documents: 855756 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 812968) total of 812968 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.074215 seconds +[default0]: number of documents: 855756 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.002994 seconds +[default0]: number of documents: 855756 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ur_train_indexmap_127177ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ur_train_indexmap_127177ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.006 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.076201 seconds +[default0]: number of documents: 573364 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 544696) total of 544696 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.045030 seconds +[default0]: number of documents: 573364 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001712 seconds +[default0]: number of documents: 573364 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_te_train_indexmap_88369ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_te_train_indexmap_88369ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.008 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.041086 seconds +[default0]: number of documents: 410633 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 390101) total of 390101 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.037322 seconds +[default0]: number of documents: 410633 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001232 seconds +[default0]: number of documents: 410633 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ta_train_indexmap_61292ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ta_train_indexmap_61292ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.006 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.039510 seconds +[default0]: number of documents: 428843 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 407401) total of 407401 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.042937 seconds +[default0]: number of documents: 428843 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.000811 seconds +[default0]: number of documents: 428843 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_bn_train_indexmap_55208ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_bn_train_indexmap_55208ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.006 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.041179 seconds +[default0]: number of documents: 417269 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 396406) total of 396406 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.033292 seconds +[default0]: number of documents: 417269 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.000941 seconds +[default0]: number of documents: 417269 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_mr_train_indexmap_44171ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_mr_train_indexmap_44171ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.007 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.051243 seconds +[default0]: number of documents: 1114455 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 1058732) total of 1058732 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.057702 seconds +[default0]: number of documents: 1114455 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.002956 seconds +[default0]: number of documents: 1114455 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_sw_train_indexmap_37186ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_sw_train_indexmap_37186ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.007 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.036420 seconds +[default0]: number of documents: 347499 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 330124) total of 330124 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.036970 seconds +[default0]: number of documents: 347499 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.000678 seconds +[default0]: number of documents: 347499 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_gu_train_indexmap_37173ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_gu_train_indexmap_37173ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.006 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.051521 seconds +[default0]: number of documents: 339210 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 322250) total of 322250 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.045940 seconds +[default0]: number of documents: 339210 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.000744 seconds +[default0]: number of documents: 339210 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_pa_train_indexmap_34691ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_pa_train_indexmap_34691ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.006 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.035845 seconds +[default0]: number of documents: 315754 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 299966) total of 299966 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.033176 seconds +[default0]: number of documents: 315754 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001053 seconds +[default0]: number of documents: 315754 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ne_train_indexmap_26100ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ne_train_indexmap_26100ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.005 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.018513 seconds +[default0]: number of documents: 918416 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 872495) total of 872495 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.038998 seconds +[default0]: number of documents: 918416 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003095 seconds +[default0]: number of documents: 918416 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_yo_train_indexmap_23389ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_yo_train_indexmap_23389ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.037 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.055613 seconds +[default0]: number of documents: 950097 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 902592) total of 902592 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.045814 seconds +[default0]: number of documents: 950097 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003719 seconds +[default0]: number of documents: 950097 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ig_train_indexmap_21563ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ig_train_indexmap_21563ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.007 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.043661 seconds +[default0]: number of documents: 915063 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 869310) total of 869310 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.060186 seconds +[default0]: number of documents: 915063 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003071 seconds +[default0]: number of documents: 915063 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ny_train_indexmap_18042ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ny_train_indexmap_18042ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.007 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.049126 seconds +[default0]: number of documents: 915061 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 869308) total of 869308 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.051271 seconds +[default0]: number of documents: 915061 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.002414 seconds +[default0]: number of documents: 915061 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_zu_train_indexmap_17484ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_zu_train_indexmap_17484ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.006 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.038865 seconds +[default0]: number of documents: 915058 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 869305) total of 869305 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.048254 seconds +[default0]: number of documents: 915058 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003352 seconds +[default0]: number of documents: 915058 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_xh_train_indexmap_16885ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_xh_train_indexmap_16885ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.009 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.043334 seconds +[default0]: number of documents: 865056 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 821803) total of 821803 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.037147 seconds +[default0]: number of documents: 865056 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003037 seconds +[default0]: number of documents: 865056 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_sn_train_indexmap_16740ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_sn_train_indexmap_16740ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.007 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.045842 seconds +[default0]: number of documents: 915044 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 869292) total of 869292 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.046476 seconds +[default0]: number of documents: 915044 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003209 seconds +[default0]: number of documents: 915044 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ts_train_indexmap_16592ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ts_train_indexmap_16592ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.007 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.048160 seconds +[default0]: number of documents: 915043 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 869291) total of 869291 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.046732 seconds +[default0]: number of documents: 915043 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.002980 seconds +[default0]: number of documents: 915043 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_rw_train_indexmap_16532ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_rw_train_indexmap_16532ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.006 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.051803 seconds +[default0]: number of documents: 915021 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 869270) total of 869270 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.042178 seconds +[default0]: number of documents: 915021 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.002900 seconds +[default0]: number of documents: 915021 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_lg_train_indexmap_15642ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_lg_train_indexmap_15642ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.007 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.046381 seconds +[default0]: number of documents: 915054 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 869301) total of 869301 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.046164 seconds +[default0]: number of documents: 915054 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003172 seconds +[default0]: number of documents: 915054 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_tn_train_indexmap_15616ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_tn_train_indexmap_15616ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.032 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.046618 seconds +[default0]: number of documents: 915051 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 869298) total of 869298 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.047121 seconds +[default0]: number of documents: 915051 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003181 seconds +[default0]: number of documents: 915051 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_nso_train_indexmap_15230ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_nso_train_indexmap_15230ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.006 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.033455 seconds +[default0]: number of documents: 318189 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 302280) total of 302280 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.035167 seconds +[default0]: number of documents: 318189 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.000575 seconds +[default0]: number of documents: 318189 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_rn_train_indexmap_12795ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_rn_train_indexmap_12795ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.003 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.065348 seconds +[default0]: number of documents: 265864 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 252571) total of 252571 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.036825 seconds +[default0]: number of documents: 265864 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.000650 seconds +[default0]: number of documents: 265864 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ml_train_indexmap_11605ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ml_train_indexmap_11605ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.004 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.035374 seconds +[default0]: number of documents: 265063 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 251810) total of 251810 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.038133 seconds +[default0]: number of documents: 265063 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.000619 seconds +[default0]: number of documents: 265063 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_kn_train_indexmap_10970ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_kn_train_indexmap_10970ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.031 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.032312 seconds +[default0]: number of documents: 265063 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 251810) total of 251810 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.032321 seconds +[default0]: number of documents: 265063 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.000620 seconds +[default0]: number of documents: 265063 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_or_train_indexmap_10706ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_or_train_indexmap_10706ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.004 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.065852 seconds +[default0]: number of documents: 265063 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 251810) total of 251810 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.045063 seconds +[default0]: number of documents: 265063 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.000849 seconds +[default0]: number of documents: 265063 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_as_train_indexmap_10360ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_as_train_indexmap_10360ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.003 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.036335 seconds +[default0]: number of documents: 365060 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 346807) total of 346807 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.004778 seconds +[default0]: number of documents: 365060 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001071 seconds +[default0]: number of documents: 365060 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ln_train_indexmap_8374ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ln_train_indexmap_8374ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.004 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.004994 seconds +[default0]: number of documents: 365063 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 346810) total of 346810 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.030929 seconds +[default0]: number of documents: 365063 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.000729 seconds +[default0]: number of documents: 365063 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_wo_train_indexmap_8126ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_wo_train_indexmap_8126ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.034 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.032622 seconds +[default0]: number of documents: 265063 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 251810) total of 251810 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.031921 seconds +[default0]: number of documents: 265063 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001043 seconds +[default0]: number of documents: 265063 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_tum_train_indexmap_7693ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_tum_train_indexmap_7693ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.003 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.024894 seconds +[default0]: number of documents: 265180 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 251921) total of 251921 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.033353 seconds +[default0]: number of documents: 265180 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.000866 seconds +[default0]: number of documents: 265180 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ki_train_indexmap_7627ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ki_train_indexmap_7627ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.003 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.031762 seconds +[default0]: number of documents: 265063 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 251810) total of 251810 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.004141 seconds +[default0]: number of documents: 265063 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.000731 seconds +[default0]: number of documents: 265063 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_st_train_indexmap_7564ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_st_train_indexmap_7564ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.003 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.034970 seconds +[default0]: number of documents: 265063 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 251810) total of 251810 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.029671 seconds +[default0]: number of documents: 265063 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.000949 seconds +[default0]: number of documents: 265063 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_fon_train_indexmap_7497ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_fon_train_indexmap_7497ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.003 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.030375 seconds +[default0]: number of documents: 271191 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 257631) total of 257631 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.038649 seconds +[default0]: number of documents: 271191 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001096 seconds +[default0]: number of documents: 271191 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ca_train_indexmap_7334ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ca_train_indexmap_7334ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.003 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.034002 seconds +[default0]: number of documents: 269973 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 256474) total of 256474 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.028805 seconds +[default0]: number of documents: 269973 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.000613 seconds +[default0]: number of documents: 269973 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_eu_train_indexmap_7168ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_eu_train_indexmap_7168ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.003 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.030715 seconds +[default0]: number of documents: 265071 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 251817) total of 251817 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.042037 seconds +[default0]: number of documents: 265071 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.000800 seconds +[default0]: number of documents: 265071 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ak_train_indexmap_7167ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ak_train_indexmap_7167ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.003 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003158 seconds +[default0]: number of documents: 265180 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 251921) total of 251921 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.030750 seconds +[default0]: number of documents: 265180 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.000619 seconds +[default0]: number of documents: 265180 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_bm_train_indexmap_7098ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_bm_train_indexmap_7098ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.003 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.030773 seconds +[default0]: number of documents: 265071 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 251817) total of 251817 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.033888 seconds +[default0]: number of documents: 265071 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.000628 seconds +[default0]: number of documents: 265071 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_tw_train_indexmap_7047ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_tw_train_indexmap_7047ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.003 seconds +[default0]:> building indices for blendable datasets ... +[default0]: > sample ratios: +[default0]: dataset 0, input: 0.387164, achieved: 0.387164 +[default0]: dataset 1, input: 0.0786754, achieved: 0.0786753 +[default0]: dataset 2, input: 0.0636898, achieved: 0.0636898 +[default0]: dataset 3, input: 0.0584986, achieved: 0.0584986 +[default0]: dataset 4, input: 0.0576332, achieved: 0.0576332 +[default0]: dataset 5, input: 0.0485994, achieved: 0.0485994 +[default0]: dataset 6, input: 0.0478619, achieved: 0.0478619 +[default0]: dataset 7, input: 0.0476917, achieved: 0.0476917 +[default0]: dataset 8, input: 0.045653, achieved: 0.045653 +[default0]: dataset 9, input: 0.0322254, achieved: 0.0322254 +[default0]: dataset 10, input: 0.0199319, achieved: 0.0199319 +[default0]: dataset 11, input: 0.0138497, achieved: 0.0138497 +[default0]: dataset 12, input: 0.00960604, achieved: 0.00960602 +[default0]: dataset 13, input: 0.00865243, achieved: 0.00865244 +[default0]: dataset 14, input: 0.00692261, achieved: 0.00692258 +[default0]: dataset 15, input: 0.00582803, achieved: 0.00582806 +[default0]: dataset 16, input: 0.00582586, achieved: 0.00582586 +[default0]: dataset 17, input: 0.00543684, achieved: 0.00543682 +[default0]: dataset 18, input: 0.00409056, achieved: 0.00409057 +[default0]: dataset 19, input: 0.00366562, achieved: 0.00366564 +[default0]: dataset 20, input: 0.00337934, achieved: 0.00337937 +[default0]: dataset 21, input: 0.00282756, achieved: 0.00282753 +[default0]: dataset 22, input: 0.00274012, achieved: 0.00274012 +[default0]: dataset 23, input: 0.00264619, achieved: 0.00264622 +[default0]: dataset 24, input: 0.00262358, achieved: 0.0026236 +[default0]: dataset 25, input: 0.0026003, achieved: 0.00260032 +[default0]: dataset 26, input: 0.00259099, achieved: 0.00259097 +[default0]: dataset 27, input: 0.00245151, achieved: 0.00245155 +[default0]: dataset 28, input: 0.00244735, achieved: 0.00244736 +[default0]: dataset 29, input: 0.00238684, achieved: 0.00238686 +[default0]: dataset 30, input: 0.0020053, achieved: 0.00200525 +[default0]: dataset 31, input: 0.00181876, achieved: 0.00181879 +[default0]: dataset 32, input: 0.00171918, achieved: 0.00171917 +[default0]: dataset 33, input: 0.00167779, achieved: 0.00167776 +[default0]: dataset 34, input: 0.00162359, achieved: 0.00162355 +[default0]: dataset 35, input: 0.00131237, achieved: 0.00131238 +[default0]: dataset 36, input: 0.00127347, achieved: 0.00127344 +[default0]: dataset 37, input: 0.00120564, achieved: 0.00120569 +[default0]: dataset 38, input: 0.00119533, achieved: 0.00119529 +[default0]: dataset 39, input: 0.00118536, achieved: 0.00118536 +[default0]: dataset 40, input: 0.00117488, achieved: 0.00117487 +[default0]: dataset 41, input: 0.00114928, achieved: 0.00114929 +[default0]: dataset 42, input: 0.00112334, achieved: 0.00112334 +[default0]: dataset 43, input: 0.00112318, achieved: 0.00112315 +[default0]: dataset 44, input: 0.00111237, achieved: 0.00111236 +[default0]: dataset 45, input: 0.00110439, achieved: 0.00110444 +[default0]:> elapsed time for building blendable dataset indices: 0.52 (sec) +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.009104 seconds +[default0]: number of documents: 31495184 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [29920425, 31495184) total of 1574759 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.002260 seconds +[default0]: number of documents: 31495184 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.002774 seconds +[default0]: number of documents: 31495184 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_en_validation_indexmap_39844ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_en_validation_indexmap_39844ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.009 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.002712 seconds +[default0]: number of documents: 5151349 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [4893782, 5151349) total of 257567 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.008012 seconds +[default0]: number of documents: 5151349 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003242 seconds +[default0]: number of documents: 5151349 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_es_validation_indexmap_8097ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_es_validation_indexmap_8097ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.003 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.009733 seconds +[default0]: number of documents: 3562772 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [3384633, 3562772) total of 178139 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003344 seconds +[default0]: number of documents: 3562772 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003237 seconds +[default0]: number of documents: 3562772 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_pt_validation_indexmap_6555ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_pt_validation_indexmap_6555ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.003 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003637 seconds +[default0]: number of documents: 2707724 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [2572338, 2707724) total of 135386 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.007405 seconds +[default0]: number of documents: 2707724 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.009500 seconds +[default0]: number of documents: 2707724 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_code_validation_indexmap_6021ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_code_validation_indexmap_6021ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.003 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003826 seconds +[default0]: number of documents: 5055942 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [4803145, 5055942) total of 252797 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.005278 seconds +[default0]: number of documents: 5055942 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003104 seconds +[default0]: number of documents: 5055942 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_fr_validation_indexmap_5932ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_fr_validation_indexmap_5932ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.007 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.002821 seconds +[default0]: number of documents: 2148955 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [2041507, 2148955) total of 107448 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.005207 seconds +[default0]: number of documents: 2148955 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003053 seconds +[default0]: number of documents: 2148955 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ar_validation_indexmap_5002ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ar_validation_indexmap_5002ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.002 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.004400 seconds +[default0]: number of documents: 2627392 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [2496022, 2627392) total of 131370 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.005908 seconds +[default0]: number of documents: 2627392 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.002582 seconds +[default0]: number of documents: 2627392 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_id_validation_indexmap_4926ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_id_validation_indexmap_4926ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.003 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003801 seconds +[default0]: number of documents: 3560556 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [3382528, 3560556) total of 178028 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.005171 seconds +[default0]: number of documents: 3560556 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003016 seconds +[default0]: number of documents: 3560556 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_zh_validation_indexmap_4909ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_zh_validation_indexmap_4909ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.003 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003618 seconds +[default0]: number of documents: 1543441 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [1466269, 1543441) total of 77172 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.004607 seconds +[default0]: number of documents: 1543441 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.002307 seconds +[default0]: number of documents: 1543441 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_hi_validation_indexmap_4699ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_hi_validation_indexmap_4699ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.002 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.002564 seconds +[default0]: number of documents: 1667306 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [1583941, 1667306) total of 83365 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.004751 seconds +[default0]: number of documents: 1667306 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.002391 seconds +[default0]: number of documents: 1667306 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_vi_validation_indexmap_3317ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_vi_validation_indexmap_3317ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.020 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003826 seconds +[default0]: number of documents: 855756 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [812968, 855756) total of 42788 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.006171 seconds +[default0]: number of documents: 855756 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003334 seconds +[default0]: number of documents: 855756 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ur_validation_indexmap_2052ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ur_validation_indexmap_2052ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.015 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001765 seconds +[default0]: number of documents: 573364 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [544696, 573364) total of 28668 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.002105 seconds +[default0]: number of documents: 573364 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.002699 seconds +[default0]: number of documents: 573364 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_te_validation_indexmap_1426ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_te_validation_indexmap_1426ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.024 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001839 seconds +[default0]: number of documents: 410633 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [390101, 410633) total of 20532 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.002319 seconds +[default0]: number of documents: 410633 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001179 seconds +[default0]: number of documents: 410633 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ta_validation_indexmap_989ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ta_validation_indexmap_989ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.045 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.002002 seconds +[default0]: number of documents: 428843 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [407401, 428843) total of 21442 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001674 seconds +[default0]: number of documents: 428843 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001162 seconds +[default0]: number of documents: 428843 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_bn_validation_indexmap_891ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_bn_validation_indexmap_891ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.014 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001560 seconds +[default0]: number of documents: 417269 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [396406, 417269) total of 20863 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001402 seconds +[default0]: number of documents: 417269 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001595 seconds +[default0]: number of documents: 417269 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_mr_validation_indexmap_713ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_mr_validation_indexmap_713ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.021 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003769 seconds +[default0]: number of documents: 1114455 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [1058732, 1114455) total of 55723 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.004658 seconds +[default0]: number of documents: 1114455 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.002294 seconds +[default0]: number of documents: 1114455 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_sw_validation_indexmap_600ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_sw_validation_indexmap_600ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.027 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.002352 seconds +[default0]: number of documents: 347499 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [330124, 347499) total of 17375 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.002167 seconds +[default0]: number of documents: 347499 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001039 seconds +[default0]: number of documents: 347499 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_gu_validation_indexmap_600ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_gu_validation_indexmap_600ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.020 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001114 seconds +[default0]: number of documents: 339210 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [322250, 339210) total of 16960 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001033 seconds +[default0]: number of documents: 339210 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001928 seconds +[default0]: number of documents: 339210 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_pa_validation_indexmap_560ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_pa_validation_indexmap_560ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.016 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001032 seconds +[default0]: number of documents: 315754 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [299966, 315754) total of 15788 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.000880 seconds +[default0]: number of documents: 315754 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001138 seconds +[default0]: number of documents: 315754 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ne_validation_indexmap_421ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ne_validation_indexmap_421ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.017 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003487 seconds +[default0]: number of documents: 918416 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [872495, 918416) total of 45921 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003621 seconds +[default0]: number of documents: 918416 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003210 seconds +[default0]: number of documents: 918416 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_yo_validation_indexmap_378ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_yo_validation_indexmap_378ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.032 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003272 seconds +[default0]: number of documents: 950097 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [902592, 950097) total of 47505 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.005053 seconds +[default0]: number of documents: 950097 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003011 seconds +[default0]: number of documents: 950097 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ig_validation_indexmap_348ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ig_validation_indexmap_348ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.002 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.004658 seconds +[default0]: number of documents: 915063 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [869310, 915063) total of 45753 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.005973 seconds +[default0]: number of documents: 915063 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003372 seconds +[default0]: number of documents: 915063 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ny_validation_indexmap_291ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ny_validation_indexmap_291ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.003 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003479 seconds +[default0]: number of documents: 915061 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [869308, 915061) total of 45753 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.005172 seconds +[default0]: number of documents: 915061 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003040 seconds +[default0]: number of documents: 915061 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_zu_validation_indexmap_282ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_zu_validation_indexmap_282ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.002 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.002976 seconds +[default0]: number of documents: 915058 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [869305, 915058) total of 45753 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.005694 seconds +[default0]: number of documents: 915058 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003637 seconds +[default0]: number of documents: 915058 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_xh_validation_indexmap_273ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_xh_validation_indexmap_273ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.002 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003834 seconds +[default0]: number of documents: 865056 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [821803, 865056) total of 43253 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.005275 seconds +[default0]: number of documents: 865056 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003041 seconds +[default0]: number of documents: 865056 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_sn_validation_indexmap_270ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_sn_validation_indexmap_270ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.002 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003535 seconds +[default0]: number of documents: 915044 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [869292, 915044) total of 45752 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.005203 seconds +[default0]: number of documents: 915044 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.002608 seconds +[default0]: number of documents: 915044 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ts_validation_indexmap_268ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ts_validation_indexmap_268ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.005 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.004352 seconds +[default0]: number of documents: 915043 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [869291, 915043) total of 45752 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.005675 seconds +[default0]: number of documents: 915043 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.002863 seconds +[default0]: number of documents: 915043 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_rw_validation_indexmap_267ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_rw_validation_indexmap_267ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.002 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003643 seconds +[default0]: number of documents: 915021 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [869270, 915021) total of 45751 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.006154 seconds +[default0]: number of documents: 915021 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003160 seconds +[default0]: number of documents: 915021 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_lg_validation_indexmap_253ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_lg_validation_indexmap_253ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.002 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.004162 seconds +[default0]: number of documents: 915054 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [869301, 915054) total of 45753 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.005791 seconds +[default0]: number of documents: 915054 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.002602 seconds +[default0]: number of documents: 915054 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_tn_validation_indexmap_252ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_tn_validation_indexmap_252ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.004 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.004467 seconds +[default0]: number of documents: 915051 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [869298, 915051) total of 45753 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003483 seconds +[default0]: number of documents: 915051 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.002685 seconds +[default0]: number of documents: 915051 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_nso_validation_indexmap_246ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_nso_validation_indexmap_246ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.018 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001533 seconds +[default0]: number of documents: 318189 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [302280, 318189) total of 15909 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001747 seconds +[default0]: number of documents: 318189 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.000902 seconds +[default0]: number of documents: 318189 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_rn_validation_indexmap_207ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_rn_validation_indexmap_207ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.002 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001236 seconds +[default0]: number of documents: 265864 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [252571, 265864) total of 13293 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001205 seconds +[default0]: number of documents: 265864 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.000930 seconds +[default0]: number of documents: 265864 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ml_validation_indexmap_188ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ml_validation_indexmap_188ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.002 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001258 seconds +[default0]: number of documents: 265063 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [251810, 265063) total of 13253 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001151 seconds +[default0]: number of documents: 265063 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.000863 seconds +[default0]: number of documents: 265063 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_kn_validation_indexmap_177ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_kn_validation_indexmap_177ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.002 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001069 seconds +[default0]: number of documents: 265063 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [251810, 265063) total of 13253 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.000884 seconds +[default0]: number of documents: 265063 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.000722 seconds +[default0]: number of documents: 265063 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_or_validation_indexmap_173ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_or_validation_indexmap_173ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.017 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001226 seconds +[default0]: number of documents: 265063 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [251810, 265063) total of 13253 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001131 seconds +[default0]: number of documents: 265063 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.000844 seconds +[default0]: number of documents: 265063 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_as_validation_indexmap_168ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_as_validation_indexmap_168ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.002 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001498 seconds +[default0]: number of documents: 365060 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [346807, 365060) total of 18253 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001567 seconds +[default0]: number of documents: 365060 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001071 seconds +[default0]: number of documents: 365060 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ln_validation_indexmap_136ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ln_validation_indexmap_136ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.013 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001841 seconds +[default0]: number of documents: 365063 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [346810, 365063) total of 18253 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.002640 seconds +[default0]: number of documents: 365063 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001158 seconds +[default0]: number of documents: 365063 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_wo_validation_indexmap_132ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_wo_validation_indexmap_132ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.002 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001173 seconds +[default0]: number of documents: 265063 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [251810, 265063) total of 13253 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001151 seconds +[default0]: number of documents: 265063 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.000873 seconds +[default0]: number of documents: 265063 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_tum_validation_indexmap_125ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_tum_validation_indexmap_125ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.002 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001246 seconds +[default0]: number of documents: 265180 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [251921, 265180) total of 13259 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.000962 seconds +[default0]: number of documents: 265180 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.000829 seconds +[default0]: number of documents: 265180 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ki_validation_indexmap_124ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ki_validation_indexmap_124ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.002 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001078 seconds +[default0]: number of documents: 265063 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [251810, 265063) total of 13253 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001323 seconds +[default0]: number of documents: 265063 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.000728 seconds +[default0]: number of documents: 265063 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_st_validation_indexmap_122ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_st_validation_indexmap_122ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.032 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001079 seconds +[default0]: number of documents: 265063 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [251810, 265063) total of 13253 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.000974 seconds +[default0]: number of documents: 265063 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.000863 seconds +[default0]: number of documents: 265063 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_fon_validation_indexmap_121ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_fon_validation_indexmap_121ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.002 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001278 seconds +[default0]: number of documents: 271191 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [257631, 271191) total of 13560 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001077 seconds +[default0]: number of documents: 271191 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.000772 seconds +[default0]: number of documents: 271191 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ca_validation_indexmap_119ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ca_validation_indexmap_119ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.002 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001258 seconds +[default0]: number of documents: 269973 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [256474, 269973) total of 13499 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.000886 seconds +[default0]: number of documents: 269973 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.000649 seconds +[default0]: number of documents: 269973 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_eu_validation_indexmap_116ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_eu_validation_indexmap_116ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.017 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001290 seconds +[default0]: number of documents: 265071 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [251817, 265071) total of 13254 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001158 seconds +[default0]: number of documents: 265071 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.000911 seconds +[default0]: number of documents: 265071 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ak_validation_indexmap_116ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ak_validation_indexmap_116ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.002 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001144 seconds +[default0]: number of documents: 265180 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [251921, 265180) total of 13259 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.000843 seconds +[default0]: number of documents: 265180 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.000677 seconds +[default0]: number of documents: 265180 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_bm_validation_indexmap_115ns_42s_decoder_packed_batch_idx.npy +[default2]:GOTCONSUMEDSAMPLES 0 0 +[default6]:GOTCONSUMEDSAMPLES 0 0 +[default4]:GOTCONSUMEDSAMPLES 0 0 +[default0]:GOTCONSUMEDSAMPLES 0 0 +[default4]:GOTCONSUMEDSAMPLES 0 0 +[default0]:GOTCONSUMEDSAMPLES 0 0 +[default2]:GOTCONSUMEDSAMPLES 0 0 +[default6]:GOTCONSUMEDSAMPLES 0 0 +[default0]:GOTCONSUMEDSAMPLES 0 0 +[default6]:GOTCONSUMEDSAMPLES 0 0 +[default4]:GOTCONSUMEDSAMPLES 0 0 +[default2]:GOTCONSUMEDSAMPLES 0 0 +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_bm_validation_indexmap_115ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.019 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001272 seconds +[default0]: number of documents: 265071 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [251817, 265071) total of 13254 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001135 seconds +[default0]: number of documents: 265071 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.000829 seconds +[default0]: number of documents: 265071 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_tw_validation_indexmap_114ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_tw_validation_indexmap_114ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.002 seconds +[default0]:> building indices for blendable datasets ... +[default0]: > sample ratios: +[default0]: dataset 0, input: 0.387164, achieved: 0.387164 +[default0]: dataset 1, input: 0.0786754, achieved: 0.0786748 +[default0]: dataset 2, input: 0.0636898, achieved: 0.06369 +[default0]: dataset 3, input: 0.0584986, achieved: 0.0584988 +[default0]: dataset 4, input: 0.0576332, achieved: 0.0576333 +[default0]: dataset 5, input: 0.0485994, achieved: 0.0485993 +[default0]: dataset 6, input: 0.0478619, achieved: 0.0478607 +[default0]: dataset 7, input: 0.0476917, achieved: 0.0476915 +[default0]: dataset 8, input: 0.045653, achieved: 0.0456527 +[default0]: dataset 9, input: 0.0322254, achieved: 0.0322258 +[default0]: dataset 10, input: 0.0199319, achieved: 0.0199317 +[default0]: dataset 11, input: 0.0138497, achieved: 0.0138501 +[default0]: dataset 12, input: 0.00960604, achieved: 0.00960523 +[default0]: dataset 13, input: 0.00865243, achieved: 0.00865317 +[default0]: dataset 14, input: 0.00692261, achieved: 0.00692215 +[default0]: dataset 15, input: 0.00582803, achieved: 0.00582776 +[default0]: dataset 16, input: 0.00582586, achieved: 0.00582584 +[default0]: dataset 17, input: 0.00543684, achieved: 0.00543732 +[default0]: dataset 18, input: 0.00409056, achieved: 0.00409097 +[default0]: dataset 19, input: 0.00366562, achieved: 0.00366591 +[default0]: dataset 20, input: 0.00337934, achieved: 0.00337933 +[default0]: dataset 21, input: 0.00282756, achieved: 0.00282733 +[default0]: dataset 22, input: 0.00274012, achieved: 0.00274078 +[default0]: dataset 23, input: 0.00264619, achieved: 0.00264654 +[default0]: dataset 24, input: 0.00262358, achieved: 0.00262346 +[default0]: dataset 25, input: 0.0026003, achieved: 0.00260038 +[default0]: dataset 26, input: 0.00259099, achieved: 0.00259076 +[default0]: dataset 27, input: 0.00245151, achieved: 0.00245228 +[default0]: dataset 28, input: 0.00244735, achieved: 0.00244651 +[default0]: dataset 29, input: 0.00238684, achieved: 0.00238688 +[default0]: dataset 30, input: 0.0020053, achieved: 0.00200606 +[default0]: dataset 31, input: 0.00181876, achieved: 0.00181949 +[default0]: dataset 32, input: 0.00171918, achieved: 0.00171948 +[default0]: dataset 33, input: 0.00167779, achieved: 0.00167716 +[default0]: dataset 34, input: 0.00162359, achieved: 0.00162331 +[default0]: dataset 35, input: 0.00131237, achieved: 0.00131173 +[default0]: dataset 36, input: 0.00127347, achieved: 0.00127326 +[default0]: dataset 37, input: 0.00120564, achieved: 0.00120594 +[default0]: dataset 38, input: 0.00119533, achieved: 0.00119633 +[default0]: dataset 39, input: 0.00118536, achieved: 0.00118479 +[default0]: dataset 40, input: 0.00117488, achieved: 0.00117517 +[default0]: dataset 41, input: 0.00114928, achieved: 0.00115017 +[default0]: dataset 42, input: 0.00112334, achieved: 0.00112324 +[default0]: dataset 43, input: 0.00112318, achieved: 0.00112324 +[default0]: dataset 44, input: 0.00111237, achieved: 0.0011117 +[default0]: dataset 45, input: 0.00110439, achieved: 0.00110401 +[default0]:> elapsed time for building blendable dataset indices: 0.03 (sec) +[default0]:> finished creating T0 datasets ... +[default0]:GOTCONSUMEDSAMPLES 0 0 +[default4]:GOTCONSUMEDSAMPLES 0 0 +[default2]:GOTCONSUMEDSAMPLES 0 0 +[default6]:GOTCONSUMEDSAMPLES 0 0 +[default1]:[001-001] 2.2365B / 1.2089B +[default7]:time (ms) | model-and-optimizer-setup: 15073.13 | train/valid/test-data-iterators-setup: 8514.20 +[default0]:[000-001] 2.2365B / 1.2089B +[default1]:[001-000] 2.2365B / 1.2089B +[default0]:[after dataloaders are built] datetime: 2022-10-07 22:51:43 +[default0]:done with setup ... +[default0]:training ... +[default0]:Number of parameters: [tensor rank - pipeline rank] w/ and w/o embeddings: +[default0]:[000-000] 2.2365B / 1.2089B +[default3]:Traceback (most recent call last): +[default3]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default3]: main() +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default3]: return f(*args, **kwargs) +[default3]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default3]: pretrain( +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 188, in pretrain +[default3]: iteration = train(forward_step_func, +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 930, in train +[default3]: train_step(forward_step_func, +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 510, in train_step +[default3]: loss = model[0].train_batch(data_iter=data_iterator) +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 345, in train_batch +[default3]: self._exec_schedule(sched) +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1375, in _exec_schedule +[default3]: self._exec_instr(**cmd.kwargs) +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 789, in _exec_load_micro_batch +[default3]: batch = self._next_batch() +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 625, in _next_batch +[default3]: batch = self.batch_fn(batch) +[default3]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 70, in get_batch_pipe +[default3]: if 'text' in data: +[default3]:TypeError: argument of type 'NoneType' is not iterable +[default5]:Traceback (most recent call last): +[default5]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default5]: main() +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default5]: return f(*args, **kwargs) +[default5]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default5]: pretrain( +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 188, in pretrain +[default5]: iteration = train(forward_step_func, +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 930, in train +[default5]: train_step(forward_step_func, +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 510, in train_step +[default5]: loss = model[0].train_batch(data_iter=data_iterator) +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 345, in train_batch +[default5]: self._exec_schedule(sched) +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1375, in _exec_schedule +[default5]: self._exec_instr(**cmd.kwargs) +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 789, in _exec_load_micro_batch +[default5]: batch = self._next_batch() +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 625, in _next_batch +[default5]: batch = self.batch_fn(batch) +[default5]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 70, in get_batch_pipe +[default5]: if 'text' in data: +[default5]:TypeError: argument of type 'NoneType' is not iterable +[default5]:Traceback (most recent call last): +[default5]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default5]: main() +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default5]: return f(*args, **kwargs) +[default5]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default5]: pretrain( +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 188, in pretrain +[default5]: iteration = train(forward_step_func, +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 930, in train +[default5]: train_step(forward_step_func, +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 510, in train_step +[default5]: loss = model[0].train_batch(data_iter=data_iterator) +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 345, in train_batch +[default5]: self._exec_schedule(sched) +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1375, in _exec_schedule +[default5]: self._exec_instr(**cmd.kwargs) +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 789, in _exec_load_micro_batch +[default5]: batch = self._next_batch() +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 625, in _next_batch +[default5]: batch = self.batch_fn(batch) +[default5]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 70, in get_batch_pipe +[default5]: if 'text' in data: +[default5]:TypeError: argument of type 'NoneType' is not iterable +[default7]:Traceback (most recent call last): +[default7]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default7]: main() +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default7]: return f(*args, **kwargs) +[default7]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default7]: pretrain( +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 188, in pretrain +[default7]: iteration = train(forward_step_func, +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 930, in train +[default7]: train_step(forward_step_func, +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 510, in train_step +[default7]: loss = model[0].train_batch(data_iter=data_iterator) +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 345, in train_batch +[default7]: self._exec_schedule(sched) +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1375, in _exec_schedule +[default7]: self._exec_instr(**cmd.kwargs) +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 789, in _exec_load_micro_batch +[default7]: batch = self._next_batch() +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 625, in _next_batch +[default7]: batch = self.batch_fn(batch) +[default7]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 70, in get_batch_pipe +[default7]: if 'text' in data: +[default7]:TypeError: argument of type 'NoneType' is not iterable +[default7]:Traceback (most recent call last): +[default7]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default7]: main() +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default7]: return f(*args, **kwargs) +[default7]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default7]: pretrain( +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 188, in pretrain +[default7]: iteration = train(forward_step_func, +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 930, in train +[default7]: train_step(forward_step_func, +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 510, in train_step +[default7]: loss = model[0].train_batch(data_iter=data_iterator) +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 345, in train_batch +[default7]: self._exec_schedule(sched) +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1375, in _exec_schedule +[default7]: self._exec_instr(**cmd.kwargs) +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 789, in _exec_load_micro_batch +[default7]: batch = self._next_batch() +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 625, in _next_batch +[default7]: batch = self.batch_fn(batch) +[default7]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 70, in get_batch_pipe +[default7]: if 'text' in data: +[default7]:TypeError: argument of type 'NoneType' is not iterable +[default3]:Traceback (most recent call last): +[default3]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default3]: main() +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default3]: return f(*args, **kwargs) +[default3]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default3]: pretrain( +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 188, in pretrain +[default3]: iteration = train(forward_step_func, +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 930, in train +[default3]: train_step(forward_step_func, +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 510, in train_step +[default3]: loss = model[0].train_batch(data_iter=data_iterator) +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 345, in train_batch +[default3]: self._exec_schedule(sched) +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1375, in _exec_schedule +[default3]: self._exec_instr(**cmd.kwargs) +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 789, in _exec_load_micro_batch +[default3]: batch = self._next_batch() +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 625, in _next_batch +[default3]: batch = self.batch_fn(batch) +[default3]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 70, in get_batch_pipe +[default3]: if 'text' in data: +[default3]:TypeError: argument of type 'NoneType' is not iterable +[default1]:Traceback (most recent call last): +[default1]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default1]: main() +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default1]: return f(*args, **kwargs) +[default1]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default1]: pretrain( +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 188, in pretrain +[default1]: iteration = train(forward_step_func, +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 930, in train +[default1]: train_step(forward_step_func, +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 510, in train_step +[default1]: loss = model[0].train_batch(data_iter=data_iterator) +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 345, in train_batch +[default1]: self._exec_schedule(sched) +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1375, in _exec_schedule +[default1]: self._exec_instr(**cmd.kwargs) +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 789, in _exec_load_micro_batch +[default1]: batch = self._next_batch() +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 625, in _next_batch +[default1]: batch = self.batch_fn(batch) +[default1]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 70, in get_batch_pipe +[default1]: if 'text' in data: +[default1]:TypeError: argument of type 'NoneType' is not iterable +[default0]:[before the start of training step] datetime: 2022-10-07 22:51:43 +[default1]:Traceback (most recent call last): +[default1]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default1]: main() +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default1]: return f(*args, **kwargs) +[default1]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default1]: pretrain( +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 188, in pretrain +[default1]: iteration = train(forward_step_func, +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 930, in train +[default1]: train_step(forward_step_func, +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 510, in train_step +[default1]: loss = model[0].train_batch(data_iter=data_iterator) +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 345, in train_batch +[default1]: self._exec_schedule(sched) +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1375, in _exec_schedule +[default1]: self._exec_instr(**cmd.kwargs) +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 789, in _exec_load_micro_batch +[default1]: batch = self._next_batch() +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 625, in _next_batch +[default1]: batch = self.batch_fn(batch) +[default1]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 70, in get_batch_pipe +[default1]: if 'text' in data: +[default1]:TypeError: argument of type 'NoneType' is not iterable +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 883143 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 883145 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 883147 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 883149 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 878807 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 878809 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 878811 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 878813 closing signal SIGTERM +ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 1 (pid: 883144) of binary: /gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/bin/python +ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 1 (pid: 878808) of binary: /gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/bin/python +ERROR:torch.distributed.elastic.agent.server.api:Error waiting on exit barrier. Elapsed: 319.8397192955017 seconds +Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/agent/server/api.py", line 906, in _exit_barrier + store_util.barrier( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/utils/store.py", line 78, in barrier + synchronize(store, data, rank, world_size, key_prefix, barrier_timeout) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/utils/store.py", line 64, in synchronize + agent_data = get_all(store, rank, key_prefix, world_size) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/utils/store.py", line 34, in get_all + data = store.get(f"{prefix}{idx}") +RuntimeError: Socket Timeout +Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 194, in _run_module_as_main + return _run_code(code, main_globals, None, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 87, in _run_code + exec(code, run_globals) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 765, in + main() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 761, in main + run(args) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 752, in run + elastic_launch( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 131, in __call__ + return launch_agent(self._config, self._entrypoint, list(args)) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 245, in launch_agent + raise ChildFailedError( +torch.distributed.elastic.multiprocessing.errors.ChildFailedError: +============================================================ +/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py FAILED +------------------------------------------------------------ +Failures: +[1]: + time : 2022-10-07_22:51:43 + host : jean-zay-iam34-ib0 + rank : 11 (local_rank: 3) + exitcode : 1 (pid: 883146) + error_file: /tmp/torchelastic_akm6vbq1/none_3cdtkui1/attempt_0/3/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 188, in pretrain + iteration = train(forward_step_func, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 930, in train + train_step(forward_step_func, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 510, in train_step + loss = model[0].train_batch(data_iter=data_iterator) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 345, in train_batch + self._exec_schedule(sched) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1375, in _exec_schedule + self._exec_instr(**cmd.kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 789, in _exec_load_micro_batch + batch = self._next_batch() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 625, in _next_batch + batch = self.batch_fn(batch) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 70, in get_batch_pipe + if 'text' in data: + TypeError: argument of type 'NoneType' is not iterable + +[2]: + time : 2022-10-07_22:51:43 + host : jean-zay-iam34-ib0 + rank : 13 (local_rank: 5) + exitcode : 1 (pid: 883148) + error_file: /tmp/torchelastic_akm6vbq1/none_3cdtkui1/attempt_0/5/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 188, in pretrain + iteration = train(forward_step_func, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 930, in train + train_step(forward_step_func, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 510, in train_step + loss = model[0].train_batch(data_iter=data_iterator) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 345, in train_batch + self._exec_schedule(sched) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1375, in _exec_schedule + self._exec_instr(**cmd.kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 789, in _exec_load_micro_batch + batch = self._next_batch() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 625, in _next_batch + batch = self.batch_fn(batch) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 70, in get_batch_pipe + if 'text' in data: + TypeError: argument of type 'NoneType' is not iterable + +[3]: + time : 2022-10-07_22:51:43 + host : jean-zay-iam34-ib0 + rank : 15 (local_rank: 7) + exitcode : 1 (pid: 883150) + error_file: /tmp/torchelastic_akm6vbq1/none_3cdtkui1/attempt_0/7/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 188, in pretrain + iteration = train(forward_step_func, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 930, in train + train_step(forward_step_func, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 510, in train_step + loss = model[0].train_batch(data_iter=data_iterator) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 345, in train_batch + self._exec_schedule(sched) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1375, in _exec_schedule + self._exec_instr(**cmd.kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 789, in _exec_load_micro_batch + batch = self._next_batch() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 625, in _next_batch + batch = self.batch_fn(batch) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 70, in get_batch_pipe + if 'text' in data: + TypeError: argument of type 'NoneType' is not iterable + +------------------------------------------------------------ +Root Cause (first observed failure): +[0]: + time : 2022-10-07_22:51:43 + host : jean-zay-iam34-ib0 + rank : 9 (local_rank: 1) + exitcode : 1 (pid: 883144) + error_file: /tmp/torchelastic_akm6vbq1/none_3cdtkui1/attempt_0/1/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 188, in pretrain + iteration = train(forward_step_func, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 930, in train + train_step(forward_step_func, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 510, in train_step + loss = model[0].train_batch(data_iter=data_iterator) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 345, in train_batch + self._exec_schedule(sched) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1375, in _exec_schedule + self._exec_instr(**cmd.kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 789, in _exec_load_micro_batch + batch = self._next_batch() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 625, in _next_batch + batch = self.batch_fn(batch) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 70, in get_batch_pipe + if 'text' in data: + TypeError: argument of type 'NoneType' is not iterable + +============================================================ +srun: error: jean-zay-iam34: task 1: Exited with exit code 1 +srun: launch/slurm: _step_signal: Terminating StepId=2114997.0 +slurmstepd: error: *** STEP 2114997.0 ON jean-zay-iam04 CANCELLED AT 2022-10-07T22:57:08 *** +WARNING:torch.distributed.elastic.agent.server.api:Received 15 death signal, shutting down workers +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 337624 closing signal SIGTERM +WARNING:torch.distributed.elastic.agent.server.api:Received 15 death signal, shutting down workers +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 337625 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 333264 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 333265 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 337626 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 333266 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 337627 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 333267 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 333268 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 337628 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 337629 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 333269 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 333270 closing signal SIGTERM +WARNING:torch.distributed.elastic.agent.server.api:Received 15 death signal, shutting down workers +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 337630 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 337631 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 333271 closing signal SIGTERM +Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/agent/server/api.py", line 906, in _exit_barrier + store_util.barrier( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/utils/store.py", line 78, in barrier + synchronize(store, data, rank, world_size, key_prefix, barrier_timeout) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/utils/store.py", line 64, in synchronize + agent_data = get_all(store, rank, key_prefix, world_size) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/utils/store.py", line 34, in get_all + data = store.get(f"{prefix}{idx}") +RuntimeError: Socket Timeout + +During handling of the above exception, another exception occurred: + +Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 194, in _run_module_as_main + return _run_code(code, main_globals, None, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 87, in _run_code + exec(code, run_globals) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 765, in + main() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 761, in main + run(args) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 752, in run + elastic_launch( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 131, in __call__ + return launch_agent(self._config, self._entrypoint, list(args)) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 236, in launch_agent + result = agent.run() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/metrics/api.py", line 125, in wrapper + result = f(*args, **kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/agent/server/api.py", line 709, in run + result = self._invoke_run(role) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/agent/server/api.py", line 877, in _invoke_run + self._exit_barrier() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/agent/server/api.py", line 906, in _exit_barrier + store_util.barrier( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/api.py", line 60, in _terminate_process_handler + raise SignalException(f"Process {os.getpid()} got signal: {sigval}", sigval=sigval) +torch.distributed.elastic.multiprocessing.api.SignalException: Process 878767 got signal: 15 +srun: error: jean-zay-iam04: task 0: Exited with exit code 1 +Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 194, in _run_module_as_main + return _run_code(code, main_globals, None, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 87, in _run_code + exec(code, run_globals) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 765, in + main() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 761, in main + run(args) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 752, in run + elastic_launch( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 131, in __call__ + return launch_agent(self._config, self._entrypoint, list(args)) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 236, in launch_agent + result = agent.run() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/metrics/api.py", line 125, in wrapper + result = f(*args, **kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/agent/server/api.py", line 709, in run + result = self._invoke_run(role) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/agent/server/api.py", line 850, in _invoke_run + time.sleep(monitor_interval) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/api.py", line 60, in _terminate_process_handler + raise SignalException(f"Process {os.getpid()} got signal: {sigval}", sigval=sigval) +torch.distributed.elastic.multiprocessing.api.SignalException: Process 337585 got signal: 15 +Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 194, in _run_module_as_main + return _run_code(code, main_globals, None, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 87, in _run_code + exec(code, run_globals) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 765, in + main() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 761, in main + run(args) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 752, in run + elastic_launch( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 131, in __call__ + return launch_agent(self._config, self._entrypoint, list(args)) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 236, in launch_agent + result = agent.run() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/metrics/api.py", line 125, in wrapper + result = f(*args, **kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/agent/server/api.py", line 709, in run + result = self._invoke_run(role) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/agent/server/api.py", line 850, in _invoke_run + time.sleep(monitor_interval) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/api.py", line 60, in _terminate_process_handler + raise SignalException(f"Process {os.getpid()} got signal: {sigval}", sigval=sigval) +torch.distributed.elastic.multiprocessing.api.SignalException: Process 333226 got signal: 15 +srun: error: jean-zay-iam35: task 2: Exited with exit code 1 +srun: error: jean-zay-iam36: task 3: Exited with exit code 1 +/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/cuda/__init__.py:83: UserWarning: CUDA initialization: Unexpected error from cudaGetDeviceCount(). Did you run some cuda functions before calling NumCudaDevices() that might have already set an error? Error 802: system not yet initialized (Triggered internally at /opt/conda/conda-bld/pytorch_1656352465323/work/c10/cuda/CUDAFunctions.cpp:109.) + return torch._C._cuda_getDeviceCount() > 0 +WARNING:__main__: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +WARNING:__main__: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +WARNING:__main__: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +WARNING:__main__: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +[default0]:/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/cuda/__init__.py:83: UserWarning: CUDA initialization: Unexpected error from cudaGetDeviceCount(). Did you run some cuda functions before calling NumCudaDevices() that might have already set an error? Error 802: system not yet initialized (Triggered internally at /opt/conda/conda-bld/pytorch_1656352465323/work/c10/cuda/CUDAFunctions.cpp:109.) +[default0]: return torch._C._cuda_getDeviceCount() > 0 +[default1]:/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/cuda/__init__.py:83: UserWarning: CUDA initialization: Unexpected error from cudaGetDeviceCount(). Did you run some cuda functions before calling NumCudaDevices() that might have already set an error? Error 802: system not yet initialized (Triggered internally at /opt/conda/conda-bld/pytorch_1656352465323/work/c10/cuda/CUDAFunctions.cpp:109.) +[default1]: return torch._C._cuda_getDeviceCount() > 0 +[default2]:/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/cuda/__init__.py:83: UserWarning: CUDA initialization: Unexpected error from cudaGetDeviceCount(). Did you run some cuda functions before calling NumCudaDevices() that might have already set an error? Error 802: system not yet initialized (Triggered internally at /opt/conda/conda-bld/pytorch_1656352465323/work/c10/cuda/CUDAFunctions.cpp:109.) +[default2]: return torch._C._cuda_getDeviceCount() > 0 +[default3]:/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/cuda/__init__.py:83: UserWarning: CUDA initialization: Unexpected error from cudaGetDeviceCount(). Did you run some cuda functions before calling NumCudaDevices() that might have already set an error? Error 802: system not yet initialized (Triggered internally at /opt/conda/conda-bld/pytorch_1656352465323/work/c10/cuda/CUDAFunctions.cpp:109.) +[default3]: return torch._C._cuda_getDeviceCount() > 0 +[default4]:/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/cuda/__init__.py:83: UserWarning: CUDA initialization: Unexpected error from cudaGetDeviceCount(). Did you run some cuda functions before calling NumCudaDevices() that might have already set an error? Error 802: system not yet initialized (Triggered internally at /opt/conda/conda-bld/pytorch_1656352465323/work/c10/cuda/CUDAFunctions.cpp:109.) +[default4]: return torch._C._cuda_getDeviceCount() > 0 +[default7]:/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/cuda/__init__.py:83: UserWarning: CUDA initialization: Unexpected error from cudaGetDeviceCount(). Did you run some cuda functions before calling NumCudaDevices() that might have already set an error? Error 802: system not yet initialized (Triggered internally at /opt/conda/conda-bld/pytorch_1656352465323/work/c10/cuda/CUDAFunctions.cpp:109.) +[default7]: return torch._C._cuda_getDeviceCount() > 0 +[default6]:/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/cuda/__init__.py:83: UserWarning: CUDA initialization: Unexpected error from cudaGetDeviceCount(). Did you run some cuda functions before calling NumCudaDevices() that might have already set an error? Error 802: system not yet initialized (Triggered internally at /opt/conda/conda-bld/pytorch_1656352465323/work/c10/cuda/CUDAFunctions.cpp:109.) +[default6]: return torch._C._cuda_getDeviceCount() > 0 +[default5]:/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/cuda/__init__.py:83: UserWarning: CUDA initialization: Unexpected error from cudaGetDeviceCount(). Did you run some cuda functions before calling NumCudaDevices() that might have already set an error? Error 802: system not yet initialized (Triggered internally at /opt/conda/conda-bld/pytorch_1656352465323/work/c10/cuda/CUDAFunctions.cpp:109.) +[default5]: return torch._C._cuda_getDeviceCount() > 0 +[default0]:No CUDA runtime is found, using CUDA_HOME='/gpfslocalsys/cuda/11.4.3' +[default1]:No CUDA runtime is found, using CUDA_HOME='/gpfslocalsys/cuda/11.4.3' +[default4]:No CUDA runtime is found, using CUDA_HOME='/gpfslocalsys/cuda/11.4.3' +[default2]:No CUDA runtime is found, using CUDA_HOME='/gpfslocalsys/cuda/11.4.3' +[default5]:No CUDA runtime is found, using CUDA_HOME='/gpfslocalsys/cuda/11.4.3' +[default3]:No CUDA runtime is found, using CUDA_HOME='/gpfslocalsys/cuda/11.4.3' +[default7]:No CUDA runtime is found, using CUDA_HOME='/gpfslocalsys/cuda/11.4.3' +[default6]:No CUDA runtime is found, using CUDA_HOME='/gpfslocalsys/cuda/11.4.3' +[default0]:Traceback (most recent call last): +[default1]:Traceback (most recent call last): +[default1]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default1]: main() +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default0]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default0]: main() +[default2]:Traceback (most recent call last): +[default2]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default2]: main() +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default2]: return f(*args, **kwargs) +[default2]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default2]: pretrain( +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default2]: initialize_megatron(extra_args_provider=extra_args_provider, +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 83, in initialize_megatron +[default2]: assert torch.cuda.is_available(), 'Megatron requires CUDA.' +[default2]:AssertionError: Megatron requires CUDA. +[default1]: return f(*args, **kwargs) +[default1]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default0]: return f(*args, **kwargs) +[default0]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default1]: pretrain( +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default1]: initialize_megatron(extra_args_provider=extra_args_provider, +[default0]: pretrain( +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default0]: initialize_megatron(extra_args_provider=extra_args_provider, +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 83, in initialize_megatron +[default0]: assert torch.cuda.is_available(), 'Megatron requires CUDA.' +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 83, in initialize_megatron +[default1]: assert torch.cuda.is_available(), 'Megatron requires CUDA.' +[default1]:AssertionError: Megatron requires CUDA. +[default0]:AssertionError: Megatron requires CUDA. +[default3]:Traceback (most recent call last): +[default3]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default3]: main() +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default3]: return f(*args, **kwargs) +[default3]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default3]: pretrain( +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default3]: initialize_megatron(extra_args_provider=extra_args_provider, +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 83, in initialize_megatron +[default3]: assert torch.cuda.is_available(), 'Megatron requires CUDA.' +[default3]:AssertionError: Megatron requires CUDA. +[default4]:Traceback (most recent call last): +[default4]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default4]: main() +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default4]: return f(*args, **kwargs) +[default4]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default4]: pretrain( +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default4]: initialize_megatron(extra_args_provider=extra_args_provider, +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 83, in initialize_megatron +[default4]: assert torch.cuda.is_available(), 'Megatron requires CUDA.' +[default4]:AssertionError: Megatron requires CUDA. +[default7]:Traceback (most recent call last): +[default7]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default7]: main() +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default7]: return f(*args, **kwargs) +[default7]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default7]: pretrain( +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default7]: initialize_megatron(extra_args_provider=extra_args_provider, +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 83, in initialize_megatron +[default7]: assert torch.cuda.is_available(), 'Megatron requires CUDA.' +[default7]:AssertionError: Megatron requires CUDA. +[default6]:Traceback (most recent call last): +[default6]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default6]: main() +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default6]: return f(*args, **kwargs) +[default6]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default6]: pretrain( +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default6]: initialize_megatron(extra_args_provider=extra_args_provider, +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 83, in initialize_megatron +[default6]: assert torch.cuda.is_available(), 'Megatron requires CUDA.' +[default6]:AssertionError: Megatron requires CUDA. +[default5]:Traceback (most recent call last): +[default5]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default5]: main() +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default5]: return f(*args, **kwargs) +[default5]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default5]: pretrain( +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default5]: initialize_megatron(extra_args_provider=extra_args_provider, +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 83, in initialize_megatron +[default5]: assert torch.cuda.is_available(), 'Megatron requires CUDA.' +[default5]:AssertionError: Megatron requires CUDA. +[default0]:using world size: 32, data-parallel-size: 8, tensor-model-parallel size: 2, pipeline-model-parallel size: 2 +[default0]:using torch.float16 for parameters ... +[default0]:------------------------ arguments ------------------------ +[default0]: abort_on_unmet_fused_kernel_constraints ......... True +[default0]: accumulate_allreduce_grads_in_fp32 .............. False +[default0]: adam_beta1 ...................................... 0.9 +[default0]: adam_beta2 ...................................... 0.95 +[default0]: adam_eps ........................................ 1e-08 +[default0]: adlr_autoresume ................................. False +[default0]: adlr_autoresume_interval ........................ 1000 +[default0]: apply_query_key_layer_scaling ................... True +[default0]: apply_residual_connection_post_layernorm ........ False +[default0]: attention_dropout ............................... 0.1 +[default0]: attention_softmax_in_fp32 ....................... False +[default0]: bert_binary_head ................................ True +[default0]: bert_load ....................................... None +[default0]: bf16 ............................................ False +[default0]: bias_dropout_fusion ............................. True +[default0]: bias_gelu_fusion ................................ True +[default0]: biencoder_projection_dim ........................ 0 +[default0]: biencoder_shared_query_context_model ............ False +[default0]: block_data_path ................................. None +[default0]: checkpoint_activations .......................... True +[default0]: checkpoint_in_cpu ............................... False +[default0]: checkpoint_num_layers ........................... 1 +[default0]: clip_grad ....................................... 1.0 +[default0]: codecarbon_dir .................................. None +[default0]: consumed_train_samples .......................... 0 +[default0]: consumed_train_tokens ........................... 0 +[default0]: consumed_valid_samples .......................... 0 +[default0]: contigious_checkpointing ........................ False +[default0]: cpu_optimizer ................................... False +[default0]: cpu_torch_adam .................................. False +[default0]: curriculum_learning ............................. False +[default0]: data_impl ....................................... mmap +[default0]: data_parallel_size .............................. 8 +[default0]: data_path ....................................... None +[default0]: dataloader_type ................................. single +[default0]: DDP_impl ........................................ local +[default0]: decoder_seq_length .............................. None +[default0]: deepscale ....................................... False +[default0]: deepscale_config ................................ None +[default0]: deepspeed ....................................... True +[default0]: deepspeed_activation_checkpointing .............. True +[default0]: deepspeed_config ................................ ./ds_config.2170781.json +[default0]: deepspeed_mpi ................................... False +[default0]: distribute_checkpointed_activations ............. False +[default0]: distributed_backend ............................. nccl +[default0]: embed_layernorm ................................. True +[default0]: embedding_path .................................. None +[default0]: encoder_seq_length .............................. 2048 +[default0]: eod_mask_loss ................................... False +[default0]: eval_interval ................................... 125 +[default0]: eval_iters ...................................... 2 +[default0]: eval_only ....................................... None +[default0]: evidence_data_path .............................. None +[default0]: exit_duration_in_mins ........................... 5990 +[default0]: exit_interval ................................... None +[default0]: ffn_hidden_size ................................. 8192 +[default0]: finetune ........................................ False +[default0]: fp16 ............................................ True +[default0]: fp16_lm_cross_entropy ........................... False +[default0]: fp32_residual_connection ........................ False +[default0]: gigaflos_no_embeds .............................. 0 +[default0]: global_batch_size ............................... 2048 +[default0]: glu_activation .................................. None +[default0]: hidden_dropout .................................. 0.1 +[default0]: hidden_size ..................................... 2048 +[default0]: hysteresis ...................................... 2 +[default0]: ict_head_size ................................... None +[default0]: ict_load ........................................ None +[default0]: img_dim ......................................... 224 +[default0]: indexer_batch_size .............................. 128 +[default0]: indexer_log_interval ............................ 1000 +[default0]: inference ....................................... False +[default0]: init_method_std ................................. 0.0048 +[default0]: init_method_xavier_uniform ...................... False +[default0]: initial_loss_scale .............................. 4294967296 +[default0]: kill_switch_path ................................ /gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/kill-switch-tr13b-1B3-mtf +[default0]: kv_channels ..................................... 128 +[default0]: layernorm_epsilon ............................... 1e-05 +[default0]: lazy_mpu_init ................................... None +[default0]: load ............................................ /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq +[default0]: local_rank ...................................... None +[default0]: log_batch_size_to_tensorboard ................... True +[default0]: log_interval .................................... 1 +[default0]: log_learning_rate_to_tensorboard ................ True +[default0]: log_level ....................................... None +[default0]: log_level_replica ............................... None +[default0]: log_loss_scale_to_tensorboard ................... True +[default0]: log_num_zeros_in_grad ........................... False +[default0]: log_params_norm ................................. False +[default0]: log_path ........................................ None +[default0]: log_timers_to_tensorboard ....................... True +[default0]: log_validation_ppl_to_tensorboard ............... True +[default0]: loss_on_targets_only ............................ False +[default0]: loss_scale ...................................... None +[default0]: loss_scale_window ............................... 1000 +[default0]: lr .............................................. 2e-05 +[default0]: lr_decay_iters .................................. None +[default0]: lr_decay_samples ................................ None +[default0]: lr_decay_style .................................. constant +[default0]: lr_decay_tokens ................................. None +[default0]: lr_warmup_fraction .............................. None +[default0]: lr_warmup_iters ................................. 0 +[default0]: lr_warmup_samples ............................... 0 +[default0]: make_vocab_size_divisible_by .................... 128 +[default0]: mask_prob ....................................... 0.15 +[default0]: masked_softmax_fusion ........................... True +[default0]: max_position_embeddings ......................... 2048 +[default0]: mean_noise_span_length .......................... None +[default0]: memory_centric_tiled_linear ..................... False +[default0]: merge_file ...................................... None +[default0]: micro_batch_size ................................ 1 +[default0]: min_loss_scale .................................. 1.0 +[default0]: min_lr .......................................... 0.0 +[default0]: mmap_warmup ..................................... False +[default0]: no_load_optim ................................... True +[default0]: no_load_rng ..................................... None +[default0]: no_save_optim ................................... None +[default0]: no_save_rng ..................................... None +[default0]: noise_density ................................... None +[default0]: norm_target_loss ................................ True +[default0]: num_attention_heads ............................. 16 +[default0]: num_channels .................................... 3 +[default0]: num_classes ..................................... 1000 +[default0]: num_layers ...................................... 24 +[default0]: num_layers_per_virtual_pipeline_stage ........... None +[default0]: num_workers ..................................... 2 +[default0]: onnx_safe ....................................... None +[default0]: openai_gelu ..................................... False +[default0]: optimizer ....................................... adam +[default0]: override_lr_scheduler ........................... False +[default0]: pad_vocab_size_to ............................... 250880 +[default0]: params_dtype .................................... torch.float16 +[default0]: partition_activations ........................... False +[default0]: patch_dim ....................................... 16 +[default0]: pipeline_model_parallel_size .................... 2 +[default0]: position_embedding_type ......................... PositionEmbeddingType.alibi +[default0]: pp_partition_method ............................. type:transformer|embedding +[default0]: prefixlm ........................................ False +[default0]: profile_backward ................................ False +[default0]: query_in_block_prob ............................. 0.1 +[default0]: rampup_batch_size ............................... None +[default0]: rank ............................................ 0 +[default0]: remote_device ................................... none +[default0]: reset_attention_mask ............................ False +[default0]: reset_position_ids .............................. False +[default0]: reset_progress .................................. True +[default0]: retriever_report_topk_accuracies ................ [] +[default0]: retriever_score_scaling ......................... False +[default0]: retriever_seq_length ............................ 256 +[default0]: reweight_loss_based_on_position_frequency ....... False +[default0]: sample_rate ..................................... 1.0 +[default0]: save ............................................ /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq +[default0]: save_interval ................................... 2 +[default0]: scatter_gather_tensors_in_pipeline .............. True +[default0]: scattered_embeddings ............................ False +[default0]: seed ............................................ 42 +[default0]: seq_length ...................................... 2048 +[default0]: sgd_momentum .................................... 0.9 +[default0]: short_seq_prob .................................. 0.1 +[default0]: skip_train_iteration_range ...................... None +[default0]: split ........................................... None +[default0]: split_transformers .............................. False +[default0]: sync_tp_duplicated_parameters ................... False +[default0]: synchronize_each_layer .......................... False +[default0]: tensor_model_parallel_size ...................... 2 +[default0]: tensorboard_dir ................................. /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/tr13b-1B3-ml-logs/tensorboard/xp3capmixnewcodelonglossseq +[default0]: tensorboard_log_interval ........................ 1 +[default0]: tensorboard_queue_size .......................... 5 +[default0]: test_weighted_split_paths ....................... None +[default0]: test_weighted_split_paths_path .................. None +[default0]: tile_factor ..................................... 1 +[default0]: titles_data_path ................................ None +[default0]: tokenizer_name_or_path .......................... bigscience/tokenizer +[default0]: tokenizer_type .................................. PretrainedFromHF +[default0]: train_iters ..................................... None +[default0]: train_samples ................................... 6348800 +[default0]: train_tokens .................................... None +[default0]: train_weighted_split_names ...................... ['train'] +[default0]: train_weighted_split_paths ...................... [['/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_en', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_es', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_pt', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_code', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_fr', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ar', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_id', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_zh', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_hi', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_vi', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ur', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_te', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ta', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_bn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_mr', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_sw', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_gu', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_pa', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ne', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_yo', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ig', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ny', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_zu', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_xh', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_sn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ts', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_rw', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_lg', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_tn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_nso', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_rn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ml', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_kn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_or', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_as', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ln', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_wo', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_tum', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ki', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_st', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_fon', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ca', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_eu', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ak', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_bm', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_tw']] +[default0]: train_weighted_split_paths_path ................. None +[default0]: train_weighted_split_splits ..................... [['0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950']] +[default0]: train_weighted_split_weights .................... [['0.3924620202', '0.0797519865', '0.0645613968', '0.0592991658', '0.0584218969', '0.0492644683', '0.0485168956', '0.048344327', '0.0462777165', '0.0326663657', '0.0202046859', '0.0140392334', '0.0097374884', '0.0087708344', '0.0070173416', '0.0059077793', '0.0059055884', '0.0055112422', '0.0041465344', '0.0037157869', '0.0034255885', '0.0028662571', '0.0027776135', '0.0026823974', '0.0026594781', '0.0026358847', '0.0026264474', '0.0024850557', '0.0024808426', '0.0024194999', '0.0020327371', '0.0018436532', '0.0017427072', '0.0017007448', '0.0016458059', '0.0013303289', '0.0012908943', '0.0012221364', '0.001211688', '0.0012015765', '0.0011909595', '0.0011650068', '0.001138717', '0.0011385485', '0.0011275944', '0.0011195053']] +[default0]: universal_checkpoint ............................ False +[default0]: use_bnb_optimizer ............................... False +[default0]: use_checkpoint_lr_scheduler ..................... False +[default0]: use_contiguous_buffers_in_ddp ................... False +[default0]: use_cpu_initialization .......................... None +[default0]: use_one_sent_docs ............................... False +[default0]: use_pin_memory .................................. False +[default0]: valid_num_workers ............................... 2 +[default0]: valid_weighted_split_names ...................... ['validation'] +[default0]: valid_weighted_split_paths ...................... [['/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_en', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_es', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_pt', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_code', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_fr', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ar', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_id', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_zh', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_hi', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_vi', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ur', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_te', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ta', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_bn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_mr', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_sw', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_gu', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_pa', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ne', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_yo', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ig', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ny', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_zu', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_xh', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_sn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ts', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_rw', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_lg', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_tn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_nso', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_rn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ml', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_kn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_or', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_as', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ln', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_wo', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_tum', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ki', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_st', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_fon', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ca', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_eu', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ak', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_bm', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_tw']] +[default0]: valid_weighted_split_paths_path ................. None +[default0]: valid_weighted_split_splits ..................... [['0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1']] +[default0]: valid_weighted_split_weights .................... [['0.3924620202', '0.0797519865', '0.0645613968', '0.0592991658', '0.0584218969', '0.0492644683', '0.0485168956', '0.048344327', '0.0462777165', '0.0326663657', '0.0202046859', '0.0140392334', '0.0097374884', '0.0087708344', '0.0070173416', '0.0059077793', '0.0059055884', '0.0055112422', '0.0041465344', '0.0037157869', '0.0034255885', '0.0028662571', '0.0027776135', '0.0026823974', '0.0026594781', '0.0026358847', '0.0026264474', '0.0024850557', '0.0024808426', '0.0024194999', '0.0020327371', '0.0018436532', '0.0017427072', '0.0017007448', '0.0016458059', '0.0013303289', '0.0012908943', '0.0012221364', '0.001211688', '0.0012015765', '0.0011909595', '0.0011650068', '0.001138717', '0.0011385485', '0.0011275944', '0.0011195053']] +[default0]: virtual_pipeline_model_parallel_size ............ None +[default0]: vocab_extra_ids ................................. 0 +[default0]: vocab_file ...................................... None +[default0]: weight_decay .................................... 0.0001 +[default0]: world_size ...................................... 32 +[default0]: zero_allgather_bucket_size ...................... 0.0 +[default0]: zero_contigious_gradients ....................... False +[default0]: zero_reduce_bucket_size ......................... 0.0 +[default0]: zero_reduce_scatter ............................. False +[default0]: zero_stage ...................................... 1 +[default0]:-------------------- end of arguments --------------------- +[default0]:setting number of micro-batches to constant 256 +[default0]:> building PretrainedFromHF tokenizer ... +[default0]: vocab file is un-used. loading tokenizer from pre-trained model +[default0]:Offline mode: forcing local_files_only=True +[default0]:Offline mode: forcing local_files_only=True +[default0]:loading file tokenizer.json from cache at /gpfswork/rech/six/commun/models/models--bigscience--tokenizer/snapshots/d56c744c331ce0ffa516ed084785eb22da74b91e/tokenizer.json +[default0]:loading file added_tokens.json from cache at None +[default0]:loading file special_tokens_map.json from cache at /gpfswork/rech/six/commun/models/models--bigscience--tokenizer/snapshots/d56c744c331ce0ffa516ed084785eb22da74b91e/special_tokens_map.json +[default0]:loading file tokenizer_config.json from cache at /gpfswork/rech/six/commun/models/models--bigscience--tokenizer/snapshots/d56c744c331ce0ffa516ed084785eb22da74b91e/tokenizer_config.json +[default0]: > padded vocab (size: 250680) with 200 dummy tokens (new size: 250880) +[default0]:DeepSpeed general environment info: +[default0]:torch install path ............... ['/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch'] +[default0]:torch version .................... 1.12.0 +[default0]:torch cuda version ............... 11.3 +[default0]:torch hip version ................ None +[default0]:nvcc version ..................... 11.4 +[default0]:deepspeed install path ........... ['/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed'] +[default0]:deepspeed info ................... 0.7.1+8b2a6371, 8b2a6371, master +[default0]:deepspeed wheel compiled w. ...... torch 1.12, cuda 11.3 +[default0]:**** Git info for Megatron: git_hash=6c1018f git_branch=mtf-multival **** +[default0]:> initializing torch distributed ... +[default0]:[2022-10-08 17:12:45,096] [INFO] [comm.py:628:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl +[default7]:> setting tensorboard ... +ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 0 (pid: 2714099) of binary: /gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/bin/python +ERROR:torch.distributed.elastic.agent.server.api:Error waiting on exit barrier. Elapsed: 317.83678245544434 seconds +Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/agent/server/api.py", line 906, in _exit_barrier + store_util.barrier( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/utils/store.py", line 78, in barrier + synchronize(store, data, rank, world_size, key_prefix, barrier_timeout) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/utils/store.py", line 64, in synchronize + agent_data = get_all(store, rank, key_prefix, world_size) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/utils/store.py", line 34, in get_all + data = store.get(f"{prefix}{idx}") +RuntimeError: Socket Timeout +Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 194, in _run_module_as_main + return _run_code(code, main_globals, None, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 87, in _run_code + exec(code, run_globals) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 765, in + main() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 761, in main + run(args) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 752, in run + elastic_launch( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 131, in __call__ + return launch_agent(self._config, self._entrypoint, list(args)) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 245, in launch_agent + raise ChildFailedError( +torch.distributed.elastic.multiprocessing.errors.ChildFailedError: +============================================================ +/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py FAILED +------------------------------------------------------------ +Failures: +[1]: + time : 2022-10-08_17:12:43 + host : jean-zay-iam50-ib0 + rank : 9 (local_rank: 1) + exitcode : 1 (pid: 2714100) + error_file: /tmp/torchelastic_8e9uv7mp/none_1k8cdyoj/attempt_0/1/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 83, in initialize_megatron + assert torch.cuda.is_available(), 'Megatron requires CUDA.' + AssertionError: Megatron requires CUDA. + +[2]: + time : 2022-10-08_17:12:43 + host : jean-zay-iam50-ib0 + rank : 10 (local_rank: 2) + exitcode : 1 (pid: 2714101) + error_file: /tmp/torchelastic_8e9uv7mp/none_1k8cdyoj/attempt_0/2/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 83, in initialize_megatron + assert torch.cuda.is_available(), 'Megatron requires CUDA.' + AssertionError: Megatron requires CUDA. + +[3]: + time : 2022-10-08_17:12:43 + host : jean-zay-iam50-ib0 + rank : 11 (local_rank: 3) + exitcode : 1 (pid: 2714102) + error_file: /tmp/torchelastic_8e9uv7mp/none_1k8cdyoj/attempt_0/3/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 83, in initialize_megatron + assert torch.cuda.is_available(), 'Megatron requires CUDA.' + AssertionError: Megatron requires CUDA. + +[4]: + time : 2022-10-08_17:12:43 + host : jean-zay-iam50-ib0 + rank : 12 (local_rank: 4) + exitcode : 1 (pid: 2714103) + error_file: /tmp/torchelastic_8e9uv7mp/none_1k8cdyoj/attempt_0/4/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 83, in initialize_megatron + assert torch.cuda.is_available(), 'Megatron requires CUDA.' + AssertionError: Megatron requires CUDA. + +[5]: + time : 2022-10-08_17:12:43 + host : jean-zay-iam50-ib0 + rank : 13 (local_rank: 5) + exitcode : 1 (pid: 2714104) + error_file: /tmp/torchelastic_8e9uv7mp/none_1k8cdyoj/attempt_0/5/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 83, in initialize_megatron + assert torch.cuda.is_available(), 'Megatron requires CUDA.' + AssertionError: Megatron requires CUDA. + +[6]: + time : 2022-10-08_17:12:43 + host : jean-zay-iam50-ib0 + rank : 14 (local_rank: 6) + exitcode : 1 (pid: 2714105) + error_file: /tmp/torchelastic_8e9uv7mp/none_1k8cdyoj/attempt_0/6/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 83, in initialize_megatron + assert torch.cuda.is_available(), 'Megatron requires CUDA.' + AssertionError: Megatron requires CUDA. + +[7]: + time : 2022-10-08_17:12:43 + host : jean-zay-iam50-ib0 + rank : 15 (local_rank: 7) + exitcode : 1 (pid: 2714106) + error_file: /tmp/torchelastic_8e9uv7mp/none_1k8cdyoj/attempt_0/7/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 83, in initialize_megatron + assert torch.cuda.is_available(), 'Megatron requires CUDA.' + AssertionError: Megatron requires CUDA. + +------------------------------------------------------------ +Root Cause (first observed failure): +[0]: + time : 2022-10-08_17:12:43 + host : jean-zay-iam50-ib0 + rank : 8 (local_rank: 0) + exitcode : 1 (pid: 2714099) + error_file: /tmp/torchelastic_8e9uv7mp/none_1k8cdyoj/attempt_0/0/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 83, in initialize_megatron + assert torch.cuda.is_available(), 'Megatron requires CUDA.' + AssertionError: Megatron requires CUDA. + +============================================================ +srun: error: jean-zay-iam50: task 1: Exited with exit code 1 +srun: launch/slurm: _step_signal: Terminating StepId=2170781.0 +slurmstepd: error: *** STEP 2170781.0 ON jean-zay-iam49 CANCELLED AT 2022-10-08T17:18:08 *** +WARNING:torch.distributed.elastic.agent.server.api:Received 15 death signal, shutting down workers +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 3638514 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 3638515 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 3638516 closing signal SIGTERM +WARNING:torch.distributed.elastic.agent.server.api:Received 15 death signal, shutting down workers +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 2591372 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 2591373 closing signal SIGTERM +WARNING:torch.distributed.elastic.agent.server.api:Received 15 death signal, shutting down workers +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 3731691 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 3638517 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 3731692 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 3638518 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 2591374 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 3638519 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 2591375 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 3638520 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 3638521 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 3731693 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 3731694 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 3731695 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 2591376 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 3731696 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 2591377 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 3731697 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 2591378 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 2591379 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 3731698 closing signal SIGTERM +Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 194, in _run_module_as_main + return _run_code(code, main_globals, None, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 87, in _run_code + exec(code, run_globals) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 765, in + main() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 761, in main + run(args) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 752, in run + elastic_launch( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 131, in __call__ + return launch_agent(self._config, self._entrypoint, list(args)) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 236, in launch_agent + result = agent.run() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/metrics/api.py", line 125, in wrapper + result = f(*args, **kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/agent/server/api.py", line 709, in run + result = self._invoke_run(role) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/agent/server/api.py", line 850, in _invoke_run + time.sleep(monitor_interval) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/api.py", line 60, in _terminate_process_handler + raise SignalException(f"Process {os.getpid()} got signal: {sigval}", sigval=sigval) +torch.distributed.elastic.multiprocessing.api.SignalException: Process 3638474 got signal: 15 +Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 194, in _run_module_as_main + return _run_code(code, main_globals, None, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 87, in _run_code + exec(code, run_globals) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 765, in + main() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 761, in main + run(args) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 752, in run + elastic_launch( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 131, in __call__ + return launch_agent(self._config, self._entrypoint, list(args)) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 236, in launch_agent + result = agent.run() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/metrics/api.py", line 125, in wrapper + result = f(*args, **kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/agent/server/api.py", line 709, in run + result = self._invoke_run(role) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/agent/server/api.py", line 850, in _invoke_run + time.sleep(monitor_interval) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/api.py", line 60, in _terminate_process_handler + raise SignalException(f"Process {os.getpid()} got signal: {sigval}", sigval=sigval) +torch.distributed.elastic.multiprocessing.api.SignalException: Process 3731651 got signal: 15 +Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 194, in _run_module_as_main + return _run_code(code, main_globals, None, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 87, in _run_code + exec(code, run_globals) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 765, in + main() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 761, in main + run(args) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 752, in run + elastic_launch( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 131, in __call__ + return launch_agent(self._config, self._entrypoint, list(args)) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 236, in launch_agent + result = agent.run() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/metrics/api.py", line 125, in wrapper + result = f(*args, **kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/agent/server/api.py", line 709, in run + result = self._invoke_run(role) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/agent/server/api.py", line 850, in _invoke_run + time.sleep(monitor_interval) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/api.py", line 60, in _terminate_process_handler + raise SignalException(f"Process {os.getpid()} got signal: {sigval}", sigval=sigval) +torch.distributed.elastic.multiprocessing.api.SignalException: Process 2591333 got signal: 15 +srun: error: jean-zay-iam49: task 0: Exited with exit code 1 +srun: error: jean-zay-iam51: task 2: Exited with exit code 1 +srun: error: jean-zay-iam52: task 3: Exited with exit code 1 +/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/cuda/__init__.py:83: UserWarning: CUDA initialization: Unexpected error from cudaGetDeviceCount(). Did you run some cuda functions before calling NumCudaDevices() that might have already set an error? Error 802: system not yet initialized (Triggered internally at /opt/conda/conda-bld/pytorch_1656352465323/work/c10/cuda/CUDAFunctions.cpp:109.) + return torch._C._cuda_getDeviceCount() > 0 +WARNING:__main__: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +WARNING:__main__: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +WARNING:__main__: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +WARNING:__main__: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +[default0]:/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/cuda/__init__.py:83: UserWarning: CUDA initialization: Unexpected error from cudaGetDeviceCount(). Did you run some cuda functions before calling NumCudaDevices() that might have already set an error? Error 802: system not yet initialized (Triggered internally at /opt/conda/conda-bld/pytorch_1656352465323/work/c10/cuda/CUDAFunctions.cpp:109.) +[default0]: return torch._C._cuda_getDeviceCount() > 0 +[default1]:/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/cuda/__init__.py:83: UserWarning: CUDA initialization: Unexpected error from cudaGetDeviceCount(). Did you run some cuda functions before calling NumCudaDevices() that might have already set an error? Error 802: system not yet initialized (Triggered internally at /opt/conda/conda-bld/pytorch_1656352465323/work/c10/cuda/CUDAFunctions.cpp:109.) +[default1]: return torch._C._cuda_getDeviceCount() > 0 +[default4]:/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/cuda/__init__.py:83: UserWarning: CUDA initialization: Unexpected error from cudaGetDeviceCount(). Did you run some cuda functions before calling NumCudaDevices() that might have already set an error? Error 802: system not yet initialized (Triggered internally at /opt/conda/conda-bld/pytorch_1656352465323/work/c10/cuda/CUDAFunctions.cpp:109.) +[default4]: return torch._C._cuda_getDeviceCount() > 0 +[default6]:/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/cuda/__init__.py:83: UserWarning: CUDA initialization: Unexpected error from cudaGetDeviceCount(). Did you run some cuda functions before calling NumCudaDevices() that might have already set an error? Error 802: system not yet initialized (Triggered internally at /opt/conda/conda-bld/pytorch_1656352465323/work/c10/cuda/CUDAFunctions.cpp:109.) +[default6]: return torch._C._cuda_getDeviceCount() > 0 +[default3]:/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/cuda/__init__.py:83: UserWarning: CUDA initialization: Unexpected error from cudaGetDeviceCount(). Did you run some cuda functions before calling NumCudaDevices() that might have already set an error? Error 802: system not yet initialized (Triggered internally at /opt/conda/conda-bld/pytorch_1656352465323/work/c10/cuda/CUDAFunctions.cpp:109.) +[default3]: return torch._C._cuda_getDeviceCount() > 0 +[default5]:/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/cuda/__init__.py:83: UserWarning: CUDA initialization: Unexpected error from cudaGetDeviceCount(). Did you run some cuda functions before calling NumCudaDevices() that might have already set an error? Error 802: system not yet initialized (Triggered internally at /opt/conda/conda-bld/pytorch_1656352465323/work/c10/cuda/CUDAFunctions.cpp:109.) +[default5]: return torch._C._cuda_getDeviceCount() > 0 +[default2]:/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/cuda/__init__.py:83: UserWarning: CUDA initialization: Unexpected error from cudaGetDeviceCount(). Did you run some cuda functions before calling NumCudaDevices() that might have already set an error? Error 802: system not yet initialized (Triggered internally at /opt/conda/conda-bld/pytorch_1656352465323/work/c10/cuda/CUDAFunctions.cpp:109.) +[default2]: return torch._C._cuda_getDeviceCount() > 0 +[default7]:/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/cuda/__init__.py:83: UserWarning: CUDA initialization: Unexpected error from cudaGetDeviceCount(). Did you run some cuda functions before calling NumCudaDevices() that might have already set an error? Error 802: system not yet initialized (Triggered internally at /opt/conda/conda-bld/pytorch_1656352465323/work/c10/cuda/CUDAFunctions.cpp:109.) +[default7]: return torch._C._cuda_getDeviceCount() > 0 +[default0]:No CUDA runtime is found, using CUDA_HOME='/gpfslocalsys/cuda/11.4.3' +[default6]:No CUDA runtime is found, using CUDA_HOME='/gpfslocalsys/cuda/11.4.3' +[default5]:No CUDA runtime is found, using CUDA_HOME='/gpfslocalsys/cuda/11.4.3' +[default1]:No CUDA runtime is found, using CUDA_HOME='/gpfslocalsys/cuda/11.4.3' +[default4]:No CUDA runtime is found, using CUDA_HOME='/gpfslocalsys/cuda/11.4.3' +[default3]:No CUDA runtime is found, using CUDA_HOME='/gpfslocalsys/cuda/11.4.3' +[default7]:No CUDA runtime is found, using CUDA_HOME='/gpfslocalsys/cuda/11.4.3' +[default0]:Traceback (most recent call last): +[default0]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default0]: main() +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default0]: return f(*args, **kwargs) +[default0]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default0]: pretrain( +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default0]: initialize_megatron(extra_args_provider=extra_args_provider, +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 83, in initialize_megatron +[default0]: assert torch.cuda.is_available(), 'Megatron requires CUDA.' +[default0]:AssertionError: Megatron requires CUDA. +[default1]:Traceback (most recent call last): +[default1]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default1]: main() +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default1]: return f(*args, **kwargs) +[default1]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default1]: pretrain( +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default1]: initialize_megatron(extra_args_provider=extra_args_provider, +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 83, in initialize_megatron +[default1]: assert torch.cuda.is_available(), 'Megatron requires CUDA.' +[default1]:AssertionError: Megatron requires CUDA. +[default4]:Traceback (most recent call last): +[default4]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default4]: main() +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default4]: return f(*args, **kwargs) +[default4]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default4]: pretrain( +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default4]: initialize_megatron(extra_args_provider=extra_args_provider, +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 83, in initialize_megatron +[default4]: assert torch.cuda.is_available(), 'Megatron requires CUDA.' +[default4]:AssertionError: Megatron requires CUDA. +[default2]:No CUDA runtime is found, using CUDA_HOME='/gpfslocalsys/cuda/11.4.3' +[default6]:Traceback (most recent call last): +[default6]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default6]: main() +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default6]: return f(*args, **kwargs) +[default6]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default6]: pretrain( +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default6]: initialize_megatron(extra_args_provider=extra_args_provider, +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 83, in initialize_megatron +[default6]: assert torch.cuda.is_available(), 'Megatron requires CUDA.' +[default6]:AssertionError: Megatron requires CUDA. +[default3]:Traceback (most recent call last): +[default3]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default3]: main() +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default3]: return f(*args, **kwargs) +[default3]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default3]: pretrain( +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default3]: initialize_megatron(extra_args_provider=extra_args_provider, +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 83, in initialize_megatron +[default3]: assert torch.cuda.is_available(), 'Megatron requires CUDA.' +[default3]:AssertionError: Megatron requires CUDA. +[default5]:Traceback (most recent call last): +[default5]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default5]: main() +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default5]: return f(*args, **kwargs) +[default5]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default5]: pretrain( +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default5]: initialize_megatron(extra_args_provider=extra_args_provider, +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 83, in initialize_megatron +[default5]: assert torch.cuda.is_available(), 'Megatron requires CUDA.' +[default5]:AssertionError: Megatron requires CUDA. +[default2]:Traceback (most recent call last): +[default2]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default2]: main() +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default2]: return f(*args, **kwargs) +[default2]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default2]: pretrain( +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default2]: initialize_megatron(extra_args_provider=extra_args_provider, +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 83, in initialize_megatron +[default2]: assert torch.cuda.is_available(), 'Megatron requires CUDA.' +[default2]:AssertionError: Megatron requires CUDA. +[default7]:Traceback (most recent call last): +[default7]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default7]: main() +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default7]: return f(*args, **kwargs) +[default7]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default7]: pretrain( +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default7]: initialize_megatron(extra_args_provider=extra_args_provider, +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 83, in initialize_megatron +[default7]: assert torch.cuda.is_available(), 'Megatron requires CUDA.' +[default7]:AssertionError: Megatron requires CUDA. +[default0]:using world size: 32, data-parallel-size: 8, tensor-model-parallel size: 2, pipeline-model-parallel size: 2 +[default0]:using torch.float16 for parameters ... +[default0]:------------------------ arguments ------------------------ +[default0]: abort_on_unmet_fused_kernel_constraints ......... True +[default0]: accumulate_allreduce_grads_in_fp32 .............. False +[default0]: adam_beta1 ...................................... 0.9 +[default0]: adam_beta2 ...................................... 0.95 +[default0]: adam_eps ........................................ 1e-08 +[default0]: adlr_autoresume ................................. False +[default0]: adlr_autoresume_interval ........................ 1000 +[default0]: apply_query_key_layer_scaling ................... True +[default0]: apply_residual_connection_post_layernorm ........ False +[default0]: attention_dropout ............................... 0.1 +[default0]: attention_softmax_in_fp32 ....................... False +[default0]: bert_binary_head ................................ True +[default0]: bert_load ....................................... None +[default0]: bf16 ............................................ False +[default0]: bias_dropout_fusion ............................. True +[default0]: bias_gelu_fusion ................................ True +[default0]: biencoder_projection_dim ........................ 0 +[default0]: biencoder_shared_query_context_model ............ False +[default0]: block_data_path ................................. None +[default0]: checkpoint_activations .......................... True +[default0]: checkpoint_in_cpu ............................... False +[default0]: checkpoint_num_layers ........................... 1 +[default0]: clip_grad ....................................... 1.0 +[default0]: codecarbon_dir .................................. None +[default0]: consumed_train_samples .......................... 0 +[default0]: consumed_train_tokens ........................... 0 +[default0]: consumed_valid_samples .......................... 0 +[default0]: contigious_checkpointing ........................ False +[default0]: cpu_optimizer ................................... False +[default0]: cpu_torch_adam .................................. False +[default0]: curriculum_learning ............................. False +[default0]: data_impl ....................................... mmap +[default0]: data_parallel_size .............................. 8 +[default0]: data_path ....................................... None +[default0]: dataloader_type ................................. single +[default0]: DDP_impl ........................................ local +[default0]: decoder_seq_length .............................. None +[default0]: deepscale ....................................... False +[default0]: deepscale_config ................................ None +[default0]: deepspeed ....................................... True +[default0]: deepspeed_activation_checkpointing .............. True +[default0]: deepspeed_config ................................ ./ds_config.2171355.json +[default0]: deepspeed_mpi ................................... False +[default0]: distribute_checkpointed_activations ............. False +[default0]: distributed_backend ............................. nccl +[default0]: embed_layernorm ................................. True +[default0]: embedding_path .................................. None +[default0]: encoder_seq_length .............................. 2048 +[default0]: eod_mask_loss ................................... False +[default0]: eval_interval ................................... 125 +[default0]: eval_iters ...................................... 2 +[default0]: eval_only ....................................... None +[default0]: evidence_data_path .............................. None +[default0]: exit_duration_in_mins ........................... 5990 +[default0]: exit_interval ................................... None +[default0]: ffn_hidden_size ................................. 8192 +[default0]: finetune ........................................ False +[default0]: fp16 ............................................ True +[default0]: fp16_lm_cross_entropy ........................... False +[default0]: fp32_residual_connection ........................ False +[default0]: gigaflos_no_embeds .............................. 0 +[default0]: global_batch_size ............................... 2048 +[default0]: glu_activation .................................. None +[default0]: hidden_dropout .................................. 0.1 +[default0]: hidden_size ..................................... 2048 +[default0]: hysteresis ...................................... 2 +[default0]: ict_head_size ................................... None +[default0]: ict_load ........................................ None +[default0]: img_dim ......................................... 224 +[default0]: indexer_batch_size .............................. 128 +[default0]: indexer_log_interval ............................ 1000 +[default0]: inference ....................................... False +[default0]: init_method_std ................................. 0.0048 +[default0]: init_method_xavier_uniform ...................... False +[default0]: initial_loss_scale .............................. 4294967296 +[default0]: kill_switch_path ................................ /gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/kill-switch-tr13b-1B3-mtf +[default0]: kv_channels ..................................... 128 +[default0]: layernorm_epsilon ............................... 1e-05 +[default0]: lazy_mpu_init ................................... None +[default0]: load ............................................ /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq +[default0]: local_rank ...................................... None +[default0]: log_batch_size_to_tensorboard ................... True +[default0]: log_interval .................................... 1 +[default0]: log_learning_rate_to_tensorboard ................ True +[default0]: log_level ....................................... None +[default0]: log_level_replica ............................... None +[default0]: log_loss_scale_to_tensorboard ................... True +[default0]: log_num_zeros_in_grad ........................... False +[default0]: log_params_norm ................................. False +[default0]: log_path ........................................ None +[default0]: log_timers_to_tensorboard ....................... True +[default0]: log_validation_ppl_to_tensorboard ............... True +[default0]: loss_on_targets_only ............................ False +[default0]: loss_scale ...................................... None +[default0]: loss_scale_window ............................... 1000 +[default0]: lr .............................................. 2e-05 +[default0]: lr_decay_iters .................................. None +[default0]: lr_decay_samples ................................ None +[default0]: lr_decay_style .................................. constant +[default0]: lr_decay_tokens ................................. None +[default0]: lr_warmup_fraction .............................. None +[default0]: lr_warmup_iters ................................. 0 +[default0]: lr_warmup_samples ............................... 0 +[default0]: make_vocab_size_divisible_by .................... 128 +[default0]: mask_prob ....................................... 0.15 +[default0]: masked_softmax_fusion ........................... True +[default0]: max_position_embeddings ......................... 2048 +[default0]: mean_noise_span_length .......................... None +[default0]: memory_centric_tiled_linear ..................... False +[default0]: merge_file ...................................... None +[default0]: micro_batch_size ................................ 1 +[default0]: min_loss_scale .................................. 1.0 +[default0]: min_lr .......................................... 0.0 +[default0]: mmap_warmup ..................................... False +[default0]: no_load_optim ................................... True +[default0]: no_load_rng ..................................... None +[default0]: no_save_optim ................................... None +[default0]: no_save_rng ..................................... None +[default0]: noise_density ................................... None +[default0]: norm_target_loss ................................ True +[default0]: num_attention_heads ............................. 16 +[default0]: num_channels .................................... 3 +[default0]: num_classes ..................................... 1000 +[default0]: num_layers ...................................... 24 +[default0]: num_layers_per_virtual_pipeline_stage ........... None +[default0]: num_workers ..................................... 2 +[default0]: onnx_safe ....................................... None +[default0]: openai_gelu ..................................... False +[default0]: optimizer ....................................... adam +[default0]: override_lr_scheduler ........................... False +[default0]: pad_vocab_size_to ............................... 250880 +[default0]: params_dtype .................................... torch.float16 +[default0]: partition_activations ........................... False +[default0]: patch_dim ....................................... 16 +[default0]: pipeline_model_parallel_size .................... 2 +[default0]: position_embedding_type ......................... PositionEmbeddingType.alibi +[default0]: pp_partition_method ............................. type:transformer|embedding +[default0]: prefixlm ........................................ False +[default0]: profile_backward ................................ False +[default0]: query_in_block_prob ............................. 0.1 +[default0]: rampup_batch_size ............................... None +[default0]: rank ............................................ 0 +[default0]: remote_device ................................... none +[default0]: reset_attention_mask ............................ False +[default0]: reset_position_ids .............................. False +[default0]: reset_progress .................................. True +[default0]: retriever_report_topk_accuracies ................ [] +[default0]: retriever_score_scaling ......................... False +[default0]: retriever_seq_length ............................ 256 +[default0]: reweight_loss_based_on_position_frequency ....... False +[default0]: sample_rate ..................................... 1.0 +[default0]: save ............................................ /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq +[default0]: save_interval ................................... 2 +[default0]: scatter_gather_tensors_in_pipeline .............. True +[default0]: scattered_embeddings ............................ False +[default0]: seed ............................................ 42 +[default0]: seq_length ...................................... 2048 +[default0]: sgd_momentum .................................... 0.9 +[default0]: short_seq_prob .................................. 0.1 +[default0]: skip_train_iteration_range ...................... None +[default0]: split ........................................... None +[default0]: split_transformers .............................. False +[default0]: sync_tp_duplicated_parameters ................... False +[default0]: synchronize_each_layer .......................... False +[default0]: tensor_model_parallel_size ...................... 2 +[default0]: tensorboard_dir ................................. /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/tr13b-1B3-ml-logs/tensorboard/xp3capmixnewcodelonglossseq +[default0]: tensorboard_log_interval ........................ 1 +[default0]: tensorboard_queue_size .......................... 5 +[default0]: test_weighted_split_paths ....................... None +[default0]: test_weighted_split_paths_path .................. None +[default0]: tile_factor ..................................... 1 +[default0]: titles_data_path ................................ None +[default0]: tokenizer_name_or_path .......................... bigscience/tokenizer +[default0]: tokenizer_type .................................. PretrainedFromHF +[default0]: train_iters ..................................... None +[default0]: train_samples ................................... 6348800 +[default0]: train_tokens .................................... None +[default0]: train_weighted_split_names ...................... ['train'] +[default0]: train_weighted_split_paths ...................... [['/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_en', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_es', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_pt', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_code', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_fr', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ar', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_id', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_zh', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_hi', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_vi', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ur', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_te', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ta', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_bn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_mr', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_sw', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_gu', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_pa', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ne', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_yo', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ig', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ny', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_zu', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_xh', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_sn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ts', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_rw', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_lg', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_tn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_nso', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_rn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ml', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_kn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_or', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_as', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ln', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_wo', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_tum', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ki', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_st', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_fon', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ca', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_eu', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ak', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_bm', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_tw']] +[default0]: train_weighted_split_paths_path ................. None +[default0]: train_weighted_split_splits ..................... [['0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950']] +[default0]: train_weighted_split_weights .................... [['0.3924620202', '0.0797519865', '0.0645613968', '0.0592991658', '0.0584218969', '0.0492644683', '0.0485168956', '0.048344327', '0.0462777165', '0.0326663657', '0.0202046859', '0.0140392334', '0.0097374884', '0.0087708344', '0.0070173416', '0.0059077793', '0.0059055884', '0.0055112422', '0.0041465344', '0.0037157869', '0.0034255885', '0.0028662571', '0.0027776135', '0.0026823974', '0.0026594781', '0.0026358847', '0.0026264474', '0.0024850557', '0.0024808426', '0.0024194999', '0.0020327371', '0.0018436532', '0.0017427072', '0.0017007448', '0.0016458059', '0.0013303289', '0.0012908943', '0.0012221364', '0.001211688', '0.0012015765', '0.0011909595', '0.0011650068', '0.001138717', '0.0011385485', '0.0011275944', '0.0011195053']] +[default0]: universal_checkpoint ............................ False +[default0]: use_bnb_optimizer ............................... False +[default0]: use_checkpoint_lr_scheduler ..................... False +[default0]: use_contiguous_buffers_in_ddp ................... False +[default0]: use_cpu_initialization .......................... None +[default0]: use_one_sent_docs ............................... False +[default0]: use_pin_memory .................................. False +[default0]: valid_num_workers ............................... 2 +[default0]: valid_weighted_split_names ...................... ['validation'] +[default0]: valid_weighted_split_paths ...................... [['/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_en', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_es', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_pt', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_code', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_fr', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ar', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_id', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_zh', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_hi', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_vi', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ur', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_te', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ta', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_bn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_mr', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_sw', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_gu', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_pa', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ne', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_yo', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ig', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ny', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_zu', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_xh', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_sn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ts', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_rw', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_lg', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_tn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_nso', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_rn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ml', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_kn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_or', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_as', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ln', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_wo', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_tum', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ki', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_st', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_fon', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ca', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_eu', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ak', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_bm', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_tw']] +[default0]: valid_weighted_split_paths_path ................. None +[default0]: valid_weighted_split_splits ..................... [['0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1']] +[default0]: valid_weighted_split_weights .................... [['0.3924620202', '0.0797519865', '0.0645613968', '0.0592991658', '0.0584218969', '0.0492644683', '0.0485168956', '0.048344327', '0.0462777165', '0.0326663657', '0.0202046859', '0.0140392334', '0.0097374884', '0.0087708344', '0.0070173416', '0.0059077793', '0.0059055884', '0.0055112422', '0.0041465344', '0.0037157869', '0.0034255885', '0.0028662571', '0.0027776135', '0.0026823974', '0.0026594781', '0.0026358847', '0.0026264474', '0.0024850557', '0.0024808426', '0.0024194999', '0.0020327371', '0.0018436532', '0.0017427072', '0.0017007448', '0.0016458059', '0.0013303289', '0.0012908943', '0.0012221364', '0.001211688', '0.0012015765', '0.0011909595', '0.0011650068', '0.001138717', '0.0011385485', '0.0011275944', '0.0011195053']] +[default0]: virtual_pipeline_model_parallel_size ............ None +[default0]: vocab_extra_ids ................................. 0 +[default0]: vocab_file ...................................... None +[default0]: weight_decay .................................... 0.0001 +[default0]: world_size ...................................... 32 +[default0]: zero_allgather_bucket_size ...................... 0.0 +[default0]: zero_contigious_gradients ....................... False +[default0]: zero_reduce_bucket_size ......................... 0.0 +[default0]: zero_reduce_scatter ............................. False +[default0]: zero_stage ...................................... 1 +[default0]:-------------------- end of arguments --------------------- +[default0]:setting number of micro-batches to constant 256 +[default0]:> building PretrainedFromHF tokenizer ... +[default0]: vocab file is un-used. loading tokenizer from pre-trained model +[default0]:Offline mode: forcing local_files_only=True +[default0]:Offline mode: forcing local_files_only=True +[default0]:loading file tokenizer.json from cache at /gpfswork/rech/six/commun/models/models--bigscience--tokenizer/snapshots/d56c744c331ce0ffa516ed084785eb22da74b91e/tokenizer.json +[default0]:loading file added_tokens.json from cache at None +[default0]:loading file special_tokens_map.json from cache at /gpfswork/rech/six/commun/models/models--bigscience--tokenizer/snapshots/d56c744c331ce0ffa516ed084785eb22da74b91e/special_tokens_map.json +[default0]:loading file tokenizer_config.json from cache at /gpfswork/rech/six/commun/models/models--bigscience--tokenizer/snapshots/d56c744c331ce0ffa516ed084785eb22da74b91e/tokenizer_config.json +[default0]: > padded vocab (size: 250680) with 200 dummy tokens (new size: 250880) +[default0]:DeepSpeed general environment info: +[default0]:torch install path ............... ['/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch'] +[default0]:torch version .................... 1.12.0 +[default0]:torch cuda version ............... 11.3 +[default0]:torch hip version ................ None +[default0]:nvcc version ..................... 11.4 +[default0]:deepspeed install path ........... ['/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed'] +[default0]:deepspeed info ................... 0.7.1+8b2a6371, 8b2a6371, master +[default0]:deepspeed wheel compiled w. ...... torch 1.12, cuda 11.3 +[default7]:> setting tensorboard ... +[default0]:**** Git info for Megatron: git_hash=6c1018f git_branch=mtf-multival **** +[default0]:> initializing torch distributed ... +[default0]:[2022-10-08 17:23:58,476] [INFO] [comm.py:628:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl +ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 0 (pid: 2714384) of binary: /gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/bin/python +ERROR:torch.distributed.elastic.agent.server.api:Error waiting on exit barrier. Elapsed: 332.4156482219696 seconds +Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/agent/server/api.py", line 906, in _exit_barrier + store_util.barrier( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/utils/store.py", line 78, in barrier + synchronize(store, data, rank, world_size, key_prefix, barrier_timeout) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/utils/store.py", line 64, in synchronize + agent_data = get_all(store, rank, key_prefix, world_size) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/utils/store.py", line 34, in get_all + data = store.get(f"{prefix}{idx}") +RuntimeError: Socket Timeout +Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 194, in _run_module_as_main + return _run_code(code, main_globals, None, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 87, in _run_code + exec(code, run_globals) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 765, in + main() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 761, in main + run(args) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 752, in run + elastic_launch( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 131, in __call__ + return launch_agent(self._config, self._entrypoint, list(args)) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 245, in launch_agent + raise ChildFailedError( +torch.distributed.elastic.multiprocessing.errors.ChildFailedError: +============================================================ +/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py FAILED +------------------------------------------------------------ +Failures: +[1]: + time : 2022-10-08_17:23:55 + host : jean-zay-iam50-ib0 + rank : 9 (local_rank: 1) + exitcode : 1 (pid: 2714385) + error_file: /tmp/torchelastic__26eayqs/none_q8sonw_0/attempt_0/1/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 83, in initialize_megatron + assert torch.cuda.is_available(), 'Megatron requires CUDA.' + AssertionError: Megatron requires CUDA. + +[2]: + time : 2022-10-08_17:23:56 + host : jean-zay-iam50-ib0 + rank : 10 (local_rank: 2) + exitcode : 1 (pid: 2714386) + error_file: /tmp/torchelastic__26eayqs/none_q8sonw_0/attempt_0/2/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 83, in initialize_megatron + assert torch.cuda.is_available(), 'Megatron requires CUDA.' + AssertionError: Megatron requires CUDA. + +[3]: + time : 2022-10-08_17:23:55 + host : jean-zay-iam50-ib0 + rank : 11 (local_rank: 3) + exitcode : 1 (pid: 2714387) + error_file: /tmp/torchelastic__26eayqs/none_q8sonw_0/attempt_0/3/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 83, in initialize_megatron + assert torch.cuda.is_available(), 'Megatron requires CUDA.' + AssertionError: Megatron requires CUDA. + +[4]: + time : 2022-10-08_17:23:55 + host : jean-zay-iam50-ib0 + rank : 12 (local_rank: 4) + exitcode : 1 (pid: 2714388) + error_file: /tmp/torchelastic__26eayqs/none_q8sonw_0/attempt_0/4/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 83, in initialize_megatron + assert torch.cuda.is_available(), 'Megatron requires CUDA.' + AssertionError: Megatron requires CUDA. + +[5]: + time : 2022-10-08_17:23:55 + host : jean-zay-iam50-ib0 + rank : 13 (local_rank: 5) + exitcode : 1 (pid: 2714389) + error_file: /tmp/torchelastic__26eayqs/none_q8sonw_0/attempt_0/5/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 83, in initialize_megatron + assert torch.cuda.is_available(), 'Megatron requires CUDA.' + AssertionError: Megatron requires CUDA. + +[6]: + time : 2022-10-08_17:23:55 + host : jean-zay-iam50-ib0 + rank : 14 (local_rank: 6) + exitcode : 1 (pid: 2714390) + error_file: /tmp/torchelastic__26eayqs/none_q8sonw_0/attempt_0/6/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 83, in initialize_megatron + assert torch.cuda.is_available(), 'Megatron requires CUDA.' + AssertionError: Megatron requires CUDA. + +[7]: + time : 2022-10-08_17:23:56 + host : jean-zay-iam50-ib0 + rank : 15 (local_rank: 7) + exitcode : 1 (pid: 2714391) + error_file: /tmp/torchelastic__26eayqs/none_q8sonw_0/attempt_0/7/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 83, in initialize_megatron + assert torch.cuda.is_available(), 'Megatron requires CUDA.' + AssertionError: Megatron requires CUDA. + +------------------------------------------------------------ +Root Cause (first observed failure): +[0]: + time : 2022-10-08_17:23:55 + host : jean-zay-iam50-ib0 + rank : 8 (local_rank: 0) + exitcode : 1 (pid: 2714384) + error_file: /tmp/torchelastic__26eayqs/none_q8sonw_0/attempt_0/0/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 83, in initialize_megatron + assert torch.cuda.is_available(), 'Megatron requires CUDA.' + AssertionError: Megatron requires CUDA. + +============================================================ +srun: error: jean-zay-iam50: task 1: Exited with exit code 1 +srun: launch/slurm: _step_signal: Terminating StepId=2171355.0 +slurmstepd: error: *** STEP 2171355.0 ON jean-zay-iam49 CANCELLED AT 2022-10-08T17:29:37 *** +WARNING:torch.distributed.elastic.agent.server.api:Received 15 death signal, shutting down workers +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 2591975 closing signal SIGTERM +WARNING:torch.distributed.elastic.agent.server.api:Received 15 death signal, shutting down workers +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 2591976 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 3732292 closing signal SIGTERM +WARNING:torch.distributed.elastic.agent.server.api:Received 15 death signal, shutting down workers +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 3732293 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 3639211 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 2591977 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 2591978 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 3732294 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 2591979 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 3639212 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 2591980 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 2591981 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 3732295 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 3639213 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 3732296 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 2591982 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 3639214 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 3732297 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 3639215 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 3732298 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 3639216 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 3732299 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 3639217 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 3639218 closing signal SIGTERM +Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 194, in _run_module_as_main + return _run_code(code, main_globals, None, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 87, in _run_code + exec(code, run_globals) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 765, in + main() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 194, in _run_module_as_main + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 761, in main + run(args) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 752, in run + elastic_launch( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 131, in __call__ + return launch_agent(self._config, self._entrypoint, list(args)) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 236, in launch_agent + result = agent.run() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/metrics/api.py", line 125, in wrapper + result = f(*args, **kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/agent/server/api.py", line 709, in run + result = self._invoke_run(role) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/agent/server/api.py", line 850, in _invoke_run + time.sleep(monitor_interval) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/api.py", line 60, in _terminate_process_handler + raise SignalException(f"Process {os.getpid()} got signal: {sigval}", sigval=sigval) +torch.distributed.elastic.multiprocessing.api.SignalException: Process 3732253 got signal: 15 + return _run_code(code, main_globals, None, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 87, in _run_code + exec(code, run_globals) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 765, in + main() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 761, in main + run(args) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 752, in run + elastic_launch( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 131, in __call__ + return launch_agent(self._config, self._entrypoint, list(args)) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 236, in launch_agent + result = agent.run() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/metrics/api.py", line 125, in wrapper + result = f(*args, **kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/agent/server/api.py", line 709, in run + result = self._invoke_run(role) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/agent/server/api.py", line 850, in _invoke_run + time.sleep(monitor_interval) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/api.py", line 60, in _terminate_process_handler + raise SignalException(f"Process {os.getpid()} got signal: {sigval}", sigval=sigval) +torch.distributed.elastic.multiprocessing.api.SignalException: Process 2591937 got signal: 15 +Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 194, in _run_module_as_main + return _run_code(code, main_globals, None, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 87, in _run_code + exec(code, run_globals) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 765, in + main() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 761, in main + run(args) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 752, in run + elastic_launch( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 131, in __call__ + return launch_agent(self._config, self._entrypoint, list(args)) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 236, in launch_agent + result = agent.run() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/metrics/api.py", line 125, in wrapper + result = f(*args, **kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/agent/server/api.py", line 709, in run + result = self._invoke_run(role) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/agent/server/api.py", line 850, in _invoke_run + time.sleep(monitor_interval) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/api.py", line 60, in _terminate_process_handler + raise SignalException(f"Process {os.getpid()} got signal: {sigval}", sigval=sigval) +torch.distributed.elastic.multiprocessing.api.SignalException: Process 3639171 got signal: 15 +srun: error: jean-zay-iam49: task 0: Exited with exit code 1 +srun: error: jean-zay-iam52: task 3: Exited with exit code 1 +srun: error: jean-zay-iam51: task 2: Exited with exit code 1 +/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/cuda/__init__.py:83: UserWarning: CUDA initialization: Unexpected error from cudaGetDeviceCount(). Did you run some cuda functions before calling NumCudaDevices() that might have already set an error? Error 802: system not yet initialized (Triggered internally at /opt/conda/conda-bld/pytorch_1656352465323/work/c10/cuda/CUDAFunctions.cpp:109.) + return torch._C._cuda_getDeviceCount() > 0 +WARNING:__main__: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +WARNING:__main__: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +WARNING:__main__: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +WARNING:__main__: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +[default1]:/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/cuda/__init__.py:83: UserWarning: CUDA initialization: Unexpected error from cudaGetDeviceCount(). Did you run some cuda functions before calling NumCudaDevices() that might have already set an error? Error 802: system not yet initialized (Triggered internally at /opt/conda/conda-bld/pytorch_1656352465323/work/c10/cuda/CUDAFunctions.cpp:109.) +[default1]: return torch._C._cuda_getDeviceCount() > 0 +[default5]:/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/cuda/__init__.py:83: UserWarning: CUDA initialization: Unexpected error from cudaGetDeviceCount(). Did you run some cuda functions before calling NumCudaDevices() that might have already set an error? Error 802: system not yet initialized (Triggered internally at /opt/conda/conda-bld/pytorch_1656352465323/work/c10/cuda/CUDAFunctions.cpp:109.) +[default5]: return torch._C._cuda_getDeviceCount() > 0 +[default3]:/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/cuda/__init__.py:83: UserWarning: CUDA initialization: Unexpected error from cudaGetDeviceCount(). Did you run some cuda functions before calling NumCudaDevices() that might have already set an error? Error 802: system not yet initialized (Triggered internally at /opt/conda/conda-bld/pytorch_1656352465323/work/c10/cuda/CUDAFunctions.cpp:109.) +[default3]: return torch._C._cuda_getDeviceCount() > 0 +[default4]:/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/cuda/__init__.py:83: UserWarning: CUDA initialization: Unexpected error from cudaGetDeviceCount(). Did you run some cuda functions before calling NumCudaDevices() that might have already set an error? Error 802: system not yet initialized (Triggered internally at /opt/conda/conda-bld/pytorch_1656352465323/work/c10/cuda/CUDAFunctions.cpp:109.) +[default6]:/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/cuda/__init__.py:83: UserWarning: CUDA initialization: Unexpected error from cudaGetDeviceCount(). Did you run some cuda functions before calling NumCudaDevices() that might have already set an error? Error 802: system not yet initialized (Triggered internally at /opt/conda/conda-bld/pytorch_1656352465323/work/c10/cuda/CUDAFunctions.cpp:109.) +[default6]: return torch._C._cuda_getDeviceCount() > 0 +[default4]: return torch._C._cuda_getDeviceCount() > 0 +[default7]:/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/cuda/__init__.py:83: UserWarning: CUDA initialization: Unexpected error from cudaGetDeviceCount(). Did you run some cuda functions before calling NumCudaDevices() that might have already set an error? Error 802: system not yet initialized (Triggered internally at /opt/conda/conda-bld/pytorch_1656352465323/work/c10/cuda/CUDAFunctions.cpp:109.) +[default7]: return torch._C._cuda_getDeviceCount() > 0 +[default0]:/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/cuda/__init__.py:83: UserWarning: CUDA initialization: Unexpected error from cudaGetDeviceCount(). Did you run some cuda functions before calling NumCudaDevices() that might have already set an error? Error 802: system not yet initialized (Triggered internally at /opt/conda/conda-bld/pytorch_1656352465323/work/c10/cuda/CUDAFunctions.cpp:109.) +[default0]: return torch._C._cuda_getDeviceCount() > 0 +[default2]:/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/cuda/__init__.py:83: UserWarning: CUDA initialization: Unexpected error from cudaGetDeviceCount(). Did you run some cuda functions before calling NumCudaDevices() that might have already set an error? Error 802: system not yet initialized (Triggered internally at /opt/conda/conda-bld/pytorch_1656352465323/work/c10/cuda/CUDAFunctions.cpp:109.) +[default2]: return torch._C._cuda_getDeviceCount() > 0 +[default3]:No CUDA runtime is found, using CUDA_HOME='/gpfslocalsys/cuda/11.4.3' +[default1]:No CUDA runtime is found, using CUDA_HOME='/gpfslocalsys/cuda/11.4.3' +[default5]:No CUDA runtime is found, using CUDA_HOME='/gpfslocalsys/cuda/11.4.3' +[default4]:No CUDA runtime is found, using CUDA_HOME='/gpfslocalsys/cuda/11.4.3' +[default7]:No CUDA runtime is found, using CUDA_HOME='/gpfslocalsys/cuda/11.4.3' +[default6]:No CUDA runtime is found, using CUDA_HOME='/gpfslocalsys/cuda/11.4.3' +[default5]:Traceback (most recent call last): +[default5]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default5]: main() +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default5]: return f(*args, **kwargs) +[default5]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default5]: pretrain( +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default5]: initialize_megatron(extra_args_provider=extra_args_provider, +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 83, in initialize_megatron +[default5]: assert torch.cuda.is_available(), 'Megatron requires CUDA.' +[default5]:AssertionError: Megatron requires CUDA. +[default1]:Traceback (most recent call last): +[default1]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default1]: main() +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default1]: return f(*args, **kwargs) +[default1]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default1]: pretrain( +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default1]: initialize_megatron(extra_args_provider=extra_args_provider, +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 83, in initialize_megatron +[default1]: assert torch.cuda.is_available(), 'Megatron requires CUDA.' +[default1]:AssertionError: Megatron requires CUDA. +[default4]:Traceback (most recent call last): +[default4]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default4]: main() +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default4]: return f(*args, **kwargs) +[default4]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default4]: pretrain( +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default4]: initialize_megatron(extra_args_provider=extra_args_provider, +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 83, in initialize_megatron +[default4]: assert torch.cuda.is_available(), 'Megatron requires CUDA.' +[default4]:AssertionError: Megatron requires CUDA. +[default3]:Traceback (most recent call last): +[default3]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default3]: main() +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default3]: return f(*args, **kwargs) +[default3]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default3]: pretrain( +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default3]: initialize_megatron(extra_args_provider=extra_args_provider, +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 83, in initialize_megatron +[default3]: assert torch.cuda.is_available(), 'Megatron requires CUDA.' +[default3]:AssertionError: Megatron requires CUDA. +[default6]:Traceback (most recent call last): +[default6]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default6]: main() +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default6]: return f(*args, **kwargs) +[default6]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default6]: pretrain( +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default6]: initialize_megatron(extra_args_provider=extra_args_provider, +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 83, in initialize_megatron +[default6]: assert torch.cuda.is_available(), 'Megatron requires CUDA.' +[default6]:AssertionError: Megatron requires CUDA. +[default7]:Traceback (most recent call last): +[default7]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default7]: main() +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default7]: return f(*args, **kwargs) +[default7]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default7]: pretrain( +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default7]: initialize_megatron(extra_args_provider=extra_args_provider, +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 83, in initialize_megatron +[default7]: assert torch.cuda.is_available(), 'Megatron requires CUDA.' +[default7]:AssertionError: Megatron requires CUDA. +[default0]:No CUDA runtime is found, using CUDA_HOME='/gpfslocalsys/cuda/11.4.3' +[default2]:No CUDA runtime is found, using CUDA_HOME='/gpfslocalsys/cuda/11.4.3' +[default2]:Traceback (most recent call last): +[default2]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default2]: main() +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default2]: return f(*args, **kwargs) +[default2]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default2]: pretrain( +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default2]: initialize_megatron(extra_args_provider=extra_args_provider, +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 83, in initialize_megatron +[default2]: assert torch.cuda.is_available(), 'Megatron requires CUDA.' +[default2]:AssertionError: Megatron requires CUDA. +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 2714676 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 2714677 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 2714678 closing signal SIGTERM +ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 3 (pid: 2714679) of binary: /gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/bin/python +[default0]:Offline mode: forcing local_files_only=True +[default0]:Offline mode: forcing local_files_only=True +[default0]:using world size: 32, data-parallel-size: 8, tensor-model-parallel size: 2, pipeline-model-parallel size: 2 +[default0]:using torch.float16 for parameters ... +[default0]:------------------------ arguments ------------------------ +[default0]: abort_on_unmet_fused_kernel_constraints ......... True +[default0]: accumulate_allreduce_grads_in_fp32 .............. False +[default0]: adam_beta1 ...................................... 0.9 +[default0]: adam_beta2 ...................................... 0.95 +[default0]: adam_eps ........................................ 1e-08 +[default0]: adlr_autoresume ................................. False +[default0]: adlr_autoresume_interval ........................ 1000 +[default0]: apply_query_key_layer_scaling ................... True +[default0]: apply_residual_connection_post_layernorm ........ False +[default0]: attention_dropout ............................... 0.1 +[default0]: attention_softmax_in_fp32 ....................... False +[default0]: bert_binary_head ................................ True +[default0]: bert_load ....................................... None +[default0]: bf16 ............................................ False +[default0]: bias_dropout_fusion ............................. True +[default0]: bias_gelu_fusion ................................ True +[default0]: biencoder_projection_dim ........................ 0 +[default0]: biencoder_shared_query_context_model ............ False +[default0]: block_data_path ................................. None +[default0]: checkpoint_activations .......................... True +[default0]: checkpoint_in_cpu ............................... False +[default0]: checkpoint_num_layers ........................... 1 +[default0]: clip_grad ....................................... 1.0 +[default0]: codecarbon_dir .................................. None +[default0]: consumed_train_samples .......................... 0 +[default0]: consumed_train_tokens ........................... 0 +[default0]: consumed_valid_samples .......................... 0 +[default0]: contigious_checkpointing ........................ False +[default0]: cpu_optimizer ................................... False +[default0]: cpu_torch_adam .................................. False +[default0]: curriculum_learning ............................. False +[default0]: data_impl ....................................... mmap +[default0]: data_parallel_size .............................. 8 +[default0]: data_path ....................................... None +[default0]: dataloader_type ................................. single +[default0]: DDP_impl ........................................ local +[default0]: decoder_seq_length .............................. None +[default0]: deepscale ....................................... False +[default0]: deepscale_config ................................ None +[default0]: deepspeed ....................................... True +[default0]: deepspeed_activation_checkpointing .............. True +[default0]: deepspeed_config ................................ ./ds_config.2171913.json +[default0]: deepspeed_mpi ................................... False +[default0]: distribute_checkpointed_activations ............. False +[default0]: distributed_backend ............................. nccl +[default0]: embed_layernorm ................................. True +[default0]: embedding_path .................................. None +[default0]: encoder_seq_length .............................. 2048 +[default0]: eod_mask_loss ................................... False +[default0]: eval_interval ................................... 125 +[default0]: eval_iters ...................................... 2 +[default0]: eval_only ....................................... None +[default0]: evidence_data_path .............................. None +[default0]: exit_duration_in_mins ........................... 5990 +[default0]: exit_interval ................................... None +[default0]: ffn_hidden_size ................................. 8192 +[default0]: finetune ........................................ False +[default0]: fp16 ............................................ True +[default0]: fp16_lm_cross_entropy ........................... False +[default0]: fp32_residual_connection ........................ False +[default0]: gigaflos_no_embeds .............................. 0 +[default0]: global_batch_size ............................... 2048 +[default0]: glu_activation .................................. None +[default0]: hidden_dropout .................................. 0.1 +[default0]: hidden_size ..................................... 2048 +[default0]: hysteresis ...................................... 2 +[default0]: ict_head_size ................................... None +[default0]: ict_load ........................................ None +[default0]: img_dim ......................................... 224 +[default0]: indexer_batch_size .............................. 128 +[default0]: indexer_log_interval ............................ 1000 +[default0]: inference ....................................... False +[default0]: init_method_std ................................. 0.0048 +[default0]: init_method_xavier_uniform ...................... False +[default0]: initial_loss_scale .............................. 4294967296 +[default0]: kill_switch_path ................................ /gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/kill-switch-tr13b-1B3-mtf +[default0]: kv_channels ..................................... 128 +[default0]: layernorm_epsilon ............................... 1e-05 +[default0]: lazy_mpu_init ................................... None +[default0]: load ............................................ /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq +[default0]: local_rank ...................................... None +[default0]: log_batch_size_to_tensorboard ................... True +[default0]: log_interval .................................... 1 +[default0]: log_learning_rate_to_tensorboard ................ True +[default0]: log_level ....................................... None +[default0]: log_level_replica ............................... None +[default0]: log_loss_scale_to_tensorboard ................... True +[default0]: log_num_zeros_in_grad ........................... False +[default0]: log_params_norm ................................. False +[default0]: log_path ........................................ None +[default0]: log_timers_to_tensorboard ....................... True +[default0]: log_validation_ppl_to_tensorboard ............... True +[default0]: loss_on_targets_only ............................ False +[default0]: loss_scale ...................................... None +[default0]: loss_scale_window ............................... 1000 +[default0]: lr .............................................. 2e-05 +[default0]: lr_decay_iters .................................. None +[default0]: lr_decay_samples ................................ None +[default0]: lr_decay_style .................................. constant +[default0]: lr_decay_tokens ................................. None +[default0]: lr_warmup_fraction .............................. None +[default0]: lr_warmup_iters ................................. 0 +[default0]: lr_warmup_samples ............................... 0 +[default0]: make_vocab_size_divisible_by .................... 128 +[default0]: mask_prob ....................................... 0.15 +[default0]: masked_softmax_fusion ........................... True +[default0]: max_position_embeddings ......................... 2048 +[default0]: mean_noise_span_length .......................... None +[default0]: memory_centric_tiled_linear ..................... False +[default0]: merge_file ...................................... None +[default0]: micro_batch_size ................................ 1 +[default0]: min_loss_scale .................................. 1.0 +[default0]: min_lr .......................................... 0.0 +[default0]: mmap_warmup ..................................... False +[default0]: no_load_optim ................................... True +[default0]: no_load_rng ..................................... None +[default0]: no_save_optim ................................... None +[default0]: no_save_rng ..................................... None +[default0]: noise_density ................................... None +[default0]: norm_target_loss ................................ True +[default0]: num_attention_heads ............................. 16 +[default0]: num_channels .................................... 3 +[default0]: num_classes ..................................... 1000 +[default0]: num_layers ...................................... 24 +[default0]: num_layers_per_virtual_pipeline_stage ........... None +[default0]: num_workers ..................................... 2 +[default0]: onnx_safe ....................................... None +[default0]: openai_gelu ..................................... False +[default0]: optimizer ....................................... adam +[default0]: override_lr_scheduler ........................... False +[default0]: pad_vocab_size_to ............................... 250880 +[default0]: params_dtype .................................... torch.float16 +[default0]: partition_activations ........................... False +[default0]: patch_dim ....................................... 16 +[default0]: pipeline_model_parallel_size .................... 2 +[default0]: position_embedding_type ......................... PositionEmbeddingType.alibi +[default0]: pp_partition_method ............................. type:transformer|embedding +[default0]: prefixlm ........................................ False +[default0]: profile_backward ................................ False +[default0]: query_in_block_prob ............................. 0.1 +[default0]: rampup_batch_size ............................... None +[default0]: rank ............................................ 0 +[default0]: remote_device ................................... none +[default0]: reset_attention_mask ............................ False +[default0]: reset_position_ids .............................. False +[default0]: reset_progress .................................. True +[default0]: retriever_report_topk_accuracies ................ [] +[default0]: retriever_score_scaling ......................... False +[default0]: retriever_seq_length ............................ 256 +[default0]: reweight_loss_based_on_position_frequency ....... False +[default0]: sample_rate ..................................... 1.0 +[default0]: save ............................................ /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq +[default0]: save_interval ................................... 2 +[default0]: scatter_gather_tensors_in_pipeline .............. True +[default0]: scattered_embeddings ............................ False +[default0]: seed ............................................ 42 +[default0]: seq_length ...................................... 2048 +[default0]: sgd_momentum .................................... 0.9 +[default0]: short_seq_prob .................................. 0.1 +[default0]: skip_train_iteration_range ...................... None +[default0]: split ........................................... None +[default0]: split_transformers .............................. False +[default0]: sync_tp_duplicated_parameters ................... False +[default0]: synchronize_each_layer .......................... False +[default0]: tensor_model_parallel_size ...................... 2 +[default0]: tensorboard_dir ................................. /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/tr13b-1B3-ml-logs/tensorboard/xp3capmixnewcodelonglossseq +[default0]: tensorboard_log_interval ........................ 1 +[default0]: tensorboard_queue_size .......................... 5 +[default0]: test_weighted_split_paths ....................... None +[default0]: test_weighted_split_paths_path .................. None +[default0]: tile_factor ..................................... 1 +[default0]: titles_data_path ................................ None +[default0]: tokenizer_name_or_path .......................... bigscience/tokenizer +[default0]: tokenizer_type .................................. PretrainedFromHF +[default0]: train_iters ..................................... None +[default0]: train_samples ................................... 6348800 +[default0]: train_tokens .................................... None +[default0]: train_weighted_split_names ...................... ['train'] +[default0]: train_weighted_split_paths ...................... [['/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_en', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_es', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_pt', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_code', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_fr', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ar', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_id', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_zh', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_hi', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_vi', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ur', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_te', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ta', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_bn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_mr', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_sw', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_gu', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_pa', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ne', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_yo', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ig', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ny', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_zu', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_xh', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_sn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ts', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_rw', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_lg', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_tn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_nso', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_rn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ml', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_kn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_or', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_as', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ln', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_wo', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_tum', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ki', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_st', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_fon', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ca', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_eu', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ak', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_bm', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_tw']] +[default0]: train_weighted_split_paths_path ................. None +[default0]: train_weighted_split_splits ..................... [['0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950']] +[default0]: train_weighted_split_weights .................... [['0.3924620202', '0.0797519865', '0.0645613968', '0.0592991658', '0.0584218969', '0.0492644683', '0.0485168956', '0.048344327', '0.0462777165', '0.0326663657', '0.0202046859', '0.0140392334', '0.0097374884', '0.0087708344', '0.0070173416', '0.0059077793', '0.0059055884', '0.0055112422', '0.0041465344', '0.0037157869', '0.0034255885', '0.0028662571', '0.0027776135', '0.0026823974', '0.0026594781', '0.0026358847', '0.0026264474', '0.0024850557', '0.0024808426', '0.0024194999', '0.0020327371', '0.0018436532', '0.0017427072', '0.0017007448', '0.0016458059', '0.0013303289', '0.0012908943', '0.0012221364', '0.001211688', '0.0012015765', '0.0011909595', '0.0011650068', '0.001138717', '0.0011385485', '0.0011275944', '0.0011195053']] +[default0]: universal_checkpoint ............................ False +[default0]: use_bnb_optimizer ............................... False +[default0]: use_checkpoint_lr_scheduler ..................... False +[default0]: use_contiguous_buffers_in_ddp ................... False +[default0]: use_cpu_initialization .......................... None +[default0]: use_one_sent_docs ............................... False +[default0]: use_pin_memory .................................. False +[default0]: valid_num_workers ............................... 2 +[default0]: valid_weighted_split_names ...................... ['validation'] +[default0]: valid_weighted_split_paths ...................... [['/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_en', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_es', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_pt', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_code', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_fr', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ar', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_id', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_zh', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_hi', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_vi', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ur', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_te', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ta', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_bn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_mr', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_sw', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_gu', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_pa', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ne', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_yo', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ig', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ny', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_zu', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_xh', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_sn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ts', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_rw', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_lg', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_tn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_nso', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_rn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ml', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_kn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_or', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_as', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ln', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_wo', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_tum', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ki', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_st', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_fon', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ca', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_eu', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ak', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_bm', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_tw']] +[default0]: valid_weighted_split_paths_path ................. None +[default0]: valid_weighted_split_splits ..................... [['0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1']] +[default0]: valid_weighted_split_weights .................... [['0.3924620202', '0.0797519865', '0.0645613968', '0.0592991658', '0.0584218969', '0.0492644683', '0.0485168956', '0.048344327', '0.0462777165', '0.0326663657', '0.0202046859', '0.0140392334', '0.0097374884', '0.0087708344', '0.0070173416', '0.0059077793', '0.0059055884', '0.0055112422', '0.0041465344', '0.0037157869', '0.0034255885', '0.0028662571', '0.0027776135', '0.0026823974', '0.0026594781', '0.0026358847', '0.0026264474', '0.0024850557', '0.0024808426', '0.0024194999', '0.0020327371', '0.0018436532', '0.0017427072', '0.0017007448', '0.0016458059', '0.0013303289', '0.0012908943', '0.0012221364', '0.001211688', '0.0012015765', '0.0011909595', '0.0011650068', '0.001138717', '0.0011385485', '0.0011275944', '0.0011195053']] +[default0]: virtual_pipeline_model_parallel_size ............ None +[default0]: vocab_extra_ids ................................. 0 +[default0]: vocab_file ...................................... None +[default0]: weight_decay .................................... 0.0001 +[default0]: world_size ...................................... 32 +[default0]: zero_allgather_bucket_size ...................... 0.0 +[default0]: zero_contigious_gradients ....................... False +[default0]: zero_reduce_bucket_size ......................... 0.0 +[default0]: zero_reduce_scatter ............................. False +[default0]: zero_stage ...................................... 1 +[default0]:-------------------- end of arguments --------------------- +[default0]:setting number of micro-batches to constant 256 +[default0]:> building PretrainedFromHF tokenizer ... +[default0]: vocab file is un-used. loading tokenizer from pre-trained model +[default0]:loading file tokenizer.json from cache at /gpfswork/rech/six/commun/models/models--bigscience--tokenizer/snapshots/d56c744c331ce0ffa516ed084785eb22da74b91e/tokenizer.json +[default0]:loading file added_tokens.json from cache at None +[default0]:loading file special_tokens_map.json from cache at /gpfswork/rech/six/commun/models/models--bigscience--tokenizer/snapshots/d56c744c331ce0ffa516ed084785eb22da74b91e/special_tokens_map.json +[default0]:loading file tokenizer_config.json from cache at /gpfswork/rech/six/commun/models/models--bigscience--tokenizer/snapshots/d56c744c331ce0ffa516ed084785eb22da74b91e/tokenizer_config.json +[default7]:> setting tensorboard ... +[default0]: > padded vocab (size: 250680) with 200 dummy tokens (new size: 250880) +[default0]:DeepSpeed general environment info: +[default0]:torch install path ............... ['/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch'] +[default0]:torch version .................... 1.12.0 +[default0]:torch cuda version ............... 11.3 +[default0]:torch hip version ................ None +[default0]:nvcc version ..................... 11.4 +[default0]:deepspeed install path ........... ['/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed'] +[default0]:deepspeed info ................... 0.7.1+8b2a6371, 8b2a6371, master +[default0]:deepspeed wheel compiled w. ...... torch 1.12, cuda 11.3 +[default0]:**** Git info for Megatron: git_hash=6c1018f git_branch=mtf-multival **** +[default0]:> initializing torch distributed ... +[default0]:[2022-10-08 17:35:18,279] [INFO] [comm.py:628:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl +WARNING:torch.distributed.elastic.agent.server.api:Received 15 death signal, shutting down workers +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 2592650 closing signal SIGTERM +WARNING:torch.distributed.elastic.agent.server.api:Received 15 death signal, shutting down workers +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 3732895 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 3732896 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 2592651 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 2592652 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 3732897 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 3732898 closing signal SIGTERM +slurmstepd: error: *** STEP 2171913.0 ON jean-zay-iam49 CANCELLED AT 2022-10-08T17:36:09 *** +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 2592653 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 2592654 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 2592655 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 3732899 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 3732900 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 3732901 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 3732902 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 2592656 closing signal SIGTERM +WARNING:torch.distributed.elastic.agent.server.api:Received 15 death signal, shutting down workers +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 2592657 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 3639941 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 3639942 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 3639943 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 3639944 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 3639945 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 3639946 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 3639947 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 3639948 closing signal SIGTERM +srun: Job step aborted: Waiting up to 62 seconds for job step to finish. +Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 194, in _run_module_as_main + return _run_code(code, main_globals, None, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 87, in _run_code + exec(code, run_globals) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 765, in + main() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 761, in main + run(args) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 752, in run + elastic_launch( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 131, in __call__ + return launch_agent(self._config, self._entrypoint, list(args)) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 236, in launch_agent + result = agent.run() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/metrics/api.py", line 125, in wrapper + result = f(*args, **kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/agent/server/api.py", line 709, in run + result = self._invoke_run(role) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/agent/server/api.py", line 850, in _invoke_run + time.sleep(monitor_interval) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/api.py", line 60, in _terminate_process_handler + raise SignalException(f"Process {os.getpid()} got signal: {sigval}", sigval=sigval) +torch.distributed.elastic.multiprocessing.api.SignalException: Process 3732855 got signal: 15 +/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/cuda/__init__.py:83: UserWarning: CUDA initialization: Unexpected error from cudaGetDeviceCount(). Did you run some cuda functions before calling NumCudaDevices() that might have already set an error? Error 802: system not yet initialized (Triggered internally at /opt/conda/conda-bld/pytorch_1656352465323/work/c10/cuda/CUDAFunctions.cpp:109.) + return torch._C._cuda_getDeviceCount() > 0 +WARNING:__main__: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +WARNING:__main__: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +WARNING:__main__: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +WARNING:__main__: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +WARNING:__main__: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +WARNING:__main__: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +WARNING:__main__: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +WARNING:__main__: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +[default0]:/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/cuda/__init__.py:83: UserWarning: CUDA initialization: Unexpected error from cudaGetDeviceCount(). Did you run some cuda functions before calling NumCudaDevices() that might have already set an error? Error 802: system not yet initialized (Triggered internally at /opt/conda/conda-bld/pytorch_1656352465323/work/c10/cuda/CUDAFunctions.cpp:109.) +[default0]: return torch._C._cuda_getDeviceCount() > 0 +[default1]:/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/cuda/__init__.py:83: UserWarning: CUDA initialization: Unexpected error from cudaGetDeviceCount(). Did you run some cuda functions before calling NumCudaDevices() that might have already set an error? Error 802: system not yet initialized (Triggered internally at /opt/conda/conda-bld/pytorch_1656352465323/work/c10/cuda/CUDAFunctions.cpp:109.) +[default1]: return torch._C._cuda_getDeviceCount() > 0 +[default2]:/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/cuda/__init__.py:83: UserWarning: CUDA initialization: Unexpected error from cudaGetDeviceCount(). Did you run some cuda functions before calling NumCudaDevices() that might have already set an error? Error 802: system not yet initialized (Triggered internally at /opt/conda/conda-bld/pytorch_1656352465323/work/c10/cuda/CUDAFunctions.cpp:109.) +[default2]: return torch._C._cuda_getDeviceCount() > 0 +[default3]:/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/cuda/__init__.py:83: UserWarning: CUDA initialization: Unexpected error from cudaGetDeviceCount(). Did you run some cuda functions before calling NumCudaDevices() that might have already set an error? Error 802: system not yet initialized (Triggered internally at /opt/conda/conda-bld/pytorch_1656352465323/work/c10/cuda/CUDAFunctions.cpp:109.) +[default3]: return torch._C._cuda_getDeviceCount() > 0 +[default4]:/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/cuda/__init__.py:83: UserWarning: CUDA initialization: Unexpected error from cudaGetDeviceCount(). Did you run some cuda functions before calling NumCudaDevices() that might have already set an error? Error 802: system not yet initialized (Triggered internally at /opt/conda/conda-bld/pytorch_1656352465323/work/c10/cuda/CUDAFunctions.cpp:109.) +[default4]: return torch._C._cuda_getDeviceCount() > 0 +[default5]:/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/cuda/__init__.py:83: UserWarning: CUDA initialization: Unexpected error from cudaGetDeviceCount(). Did you run some cuda functions before calling NumCudaDevices() that might have already set an error? Error 802: system not yet initialized (Triggered internally at /opt/conda/conda-bld/pytorch_1656352465323/work/c10/cuda/CUDAFunctions.cpp:109.) +[default5]: return torch._C._cuda_getDeviceCount() > 0 +[default6]:/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/cuda/__init__.py:83: UserWarning: CUDA initialization: Unexpected error from cudaGetDeviceCount(). Did you run some cuda functions before calling NumCudaDevices() that might have already set an error? Error 802: system not yet initialized (Triggered internally at /opt/conda/conda-bld/pytorch_1656352465323/work/c10/cuda/CUDAFunctions.cpp:109.) +[default6]: return torch._C._cuda_getDeviceCount() > 0 +[default7]:/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/cuda/__init__.py:83: UserWarning: CUDA initialization: Unexpected error from cudaGetDeviceCount(). Did you run some cuda functions before calling NumCudaDevices() that might have already set an error? Error 802: system not yet initialized (Triggered internally at /opt/conda/conda-bld/pytorch_1656352465323/work/c10/cuda/CUDAFunctions.cpp:109.) +[default7]: return torch._C._cuda_getDeviceCount() > 0 +[default0]:No CUDA runtime is found, using CUDA_HOME='/gpfslocalsys/cuda/11.4.3' +[default1]:No CUDA runtime is found, using CUDA_HOME='/gpfslocalsys/cuda/11.4.3' +[default2]:No CUDA runtime is found, using CUDA_HOME='/gpfslocalsys/cuda/11.4.3' +[default3]:No CUDA runtime is found, using CUDA_HOME='/gpfslocalsys/cuda/11.4.3' +[default6]:No CUDA runtime is found, using CUDA_HOME='/gpfslocalsys/cuda/11.4.3' +[default4]:No CUDA runtime is found, using CUDA_HOME='/gpfslocalsys/cuda/11.4.3' +[default5]:No CUDA runtime is found, using CUDA_HOME='/gpfslocalsys/cuda/11.4.3' +[default0]:Traceback (most recent call last): +[default7]:No CUDA runtime is found, using CUDA_HOME='/gpfslocalsys/cuda/11.4.3' +[default0]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default0]: main() +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default0]: return f(*args, **kwargs) +[default0]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default0]: pretrain( +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default0]: initialize_megatron(extra_args_provider=extra_args_provider, +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 83, in initialize_megatron +[default0]: assert torch.cuda.is_available(), 'Megatron requires CUDA.' +[default0]:AssertionError: Megatron requires CUDA. +[default1]:Traceback (most recent call last): +[default1]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default1]: main() +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default1]: return f(*args, **kwargs) +[default1]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default1]: pretrain( +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default1]: initialize_megatron(extra_args_provider=extra_args_provider, +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 83, in initialize_megatron +[default1]: assert torch.cuda.is_available(), 'Megatron requires CUDA.' +[default1]:AssertionError: Megatron requires CUDA. +[default2]:Traceback (most recent call last): +[default2]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default2]: main() +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default2]: return f(*args, **kwargs) +[default2]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default2]: pretrain( +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default2]: initialize_megatron(extra_args_provider=extra_args_provider, +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 83, in initialize_megatron +[default2]: assert torch.cuda.is_available(), 'Megatron requires CUDA.' +[default2]:AssertionError: Megatron requires CUDA. +[default3]:Traceback (most recent call last): +[default3]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default4]:Traceback (most recent call last): +[default4]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default4]: main() +[default3]: main() +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default4]: return f(*args, **kwargs) +[default3]: return f(*args, **kwargs) +[default3]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default3]: pretrain( +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default4]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default4]: pretrain( +[default3]: initialize_megatron(extra_args_provider=extra_args_provider, +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 83, in initialize_megatron +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default4]: initialize_megatron(extra_args_provider=extra_args_provider, +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 83, in initialize_megatron +[default4]: assert torch.cuda.is_available(), 'Megatron requires CUDA.' +[default4]:AssertionError: Megatron requires CUDA. +[default3]: assert torch.cuda.is_available(), 'Megatron requires CUDA.' +[default3]:AssertionError: Megatron requires CUDA. +[default5]:Traceback (most recent call last): +[default6]:Traceback (most recent call last): +[default6]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default6]: main() +[default7]:Traceback (most recent call last): +[default7]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default7]: main() +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default6]: return f(*args, **kwargs) +[default6]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default6]: pretrain( +[default5]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default5]: main() +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default5]: return f(*args, **kwargs) +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default7]: return f(*args, **kwargs) +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default6]: initialize_megatron(extra_args_provider=extra_args_provider, +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 83, in initialize_megatron +[default7]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default7]: pretrain( +[default5]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default5]: pretrain( +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default6]: assert torch.cuda.is_available(), 'Megatron requires CUDA.' +[default6]:AssertionError: Megatron requires CUDA. +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain +[default7]: initialize_megatron(extra_args_provider=extra_args_provider, +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 83, in initialize_megatron +[default7]: assert torch.cuda.is_available(), 'Megatron requires CUDA.' +[default5]: initialize_megatron(extra_args_provider=extra_args_provider, +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 83, in initialize_megatron +[default7]:AssertionError: Megatron requires CUDA. +[default5]: assert torch.cuda.is_available(), 'Megatron requires CUDA.' +[default5]:AssertionError: Megatron requires CUDA. +ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 0 (pid: 2714958) of binary: /gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/bin/python +[default0]:using world size: 64, data-parallel-size: 64, tensor-model-parallel size: 1, pipeline-model-parallel size: 1 +[default0]:using torch.float16 for parameters ... +[default0]:------------------------ arguments ------------------------ +[default0]: abort_on_unmet_fused_kernel_constraints ......... True +[default0]: accumulate_allreduce_grads_in_fp32 .............. False +[default0]: adam_beta1 ...................................... 0.9 +[default0]: adam_beta2 ...................................... 0.95 +[default0]: adam_eps ........................................ 1e-08 +[default0]: adlr_autoresume ................................. False +[default0]: adlr_autoresume_interval ........................ 1000 +[default0]: apply_query_key_layer_scaling ................... True +[default0]: apply_residual_connection_post_layernorm ........ False +[default0]: attention_dropout ............................... 0.1 +[default0]: attention_softmax_in_fp32 ....................... False +[default0]: bert_binary_head ................................ True +[default0]: bert_load ....................................... None +[default0]: bf16 ............................................ False +[default0]: bias_dropout_fusion ............................. True +[default0]: bias_gelu_fusion ................................ True +[default0]: biencoder_projection_dim ........................ 0 +[default0]: biencoder_shared_query_context_model ............ False +[default0]: block_data_path ................................. None +[default0]: checkpoint_activations .......................... True +[default0]: checkpoint_in_cpu ............................... False +[default0]: checkpoint_num_layers ........................... 1 +[default0]: clip_grad ....................................... 1.0 +[default0]: codecarbon_dir .................................. None +[default0]: consumed_train_samples .......................... 0 +[default0]: consumed_train_tokens ........................... 0 +[default0]: consumed_valid_samples .......................... 0 +[default0]: contigious_checkpointing ........................ False +[default0]: cpu_optimizer ................................... False +[default0]: cpu_torch_adam .................................. False +[default0]: curriculum_learning ............................. False +[default0]: data_impl ....................................... mmap +[default0]: data_parallel_size .............................. 64 +[default0]: data_path ....................................... None +[default0]: dataloader_type ................................. single +[default0]: DDP_impl ........................................ local +[default0]: decoder_seq_length .............................. None +[default0]: deepscale ....................................... False +[default0]: deepscale_config ................................ None +[default0]: deepspeed ....................................... True +[default0]: deepspeed_activation_checkpointing .............. True +[default0]: deepspeed_config ................................ ./ds_config.2172033.json +[default0]: deepspeed_mpi ................................... False +[default0]: distribute_checkpointed_activations ............. False +[default0]: distributed_backend ............................. nccl +[default0]: embed_layernorm ................................. True +[default0]: embedding_path .................................. None +[default0]: encoder_seq_length .............................. 2048 +[default0]: eod_mask_loss ................................... False +[default0]: eval_interval ................................... 125 +[default0]: eval_iters ...................................... 2 +[default0]: eval_only ....................................... None +[default0]: evidence_data_path .............................. None +[default0]: exit_duration_in_mins ........................... 5990 +[default0]: exit_interval ................................... None +[default0]: ffn_hidden_size ................................. 8192 +[default0]: finetune ........................................ False +[default0]: fp16 ............................................ True +[default0]: fp16_lm_cross_entropy ........................... False +[default0]: fp32_residual_connection ........................ False +[default0]: gigaflos_no_embeds .............................. 0 +[default0]: global_batch_size ............................... 2048 +[default0]: glu_activation .................................. None +[default0]: hidden_dropout .................................. 0.1 +[default0]: hidden_size ..................................... 2048 +[default0]: hysteresis ...................................... 2 +[default0]: ict_head_size ................................... None +[default0]: ict_load ........................................ None +[default0]: img_dim ......................................... 224 +[default0]: indexer_batch_size .............................. 128 +[default0]: indexer_log_interval ............................ 1000 +[default0]: inference ....................................... False +[default0]: init_method_std ................................. 0.0048 +[default0]: init_method_xavier_uniform ...................... False +[default0]: initial_loss_scale .............................. 4294967296 +[default0]: kill_switch_path ................................ /gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/kill-switch-tr13b-1B3-mtf +[default0]: kv_channels ..................................... 128 +[default0]: layernorm_epsilon ............................... 1e-05 +[default0]: lazy_mpu_init ................................... None +[default0]: load ............................................ /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq +[default0]: local_rank ...................................... None +[default0]: log_batch_size_to_tensorboard ................... True +[default0]: log_interval .................................... 1 +[default0]: log_learning_rate_to_tensorboard ................ True +[default0]: log_level ....................................... None +[default0]: log_level_replica ............................... None +[default0]: log_loss_scale_to_tensorboard ................... True +[default0]: log_num_zeros_in_grad ........................... False +[default0]: log_params_norm ................................. False +[default0]: log_path ........................................ None +[default0]: log_timers_to_tensorboard ....................... True +[default0]: log_validation_ppl_to_tensorboard ............... True +[default0]: loss_on_targets_only ............................ False +[default0]: loss_scale ...................................... None +[default0]: loss_scale_window ............................... 1000 +[default0]: lr .............................................. 2e-05 +[default0]: lr_decay_iters .................................. None +[default0]: lr_decay_samples ................................ None +[default0]: lr_decay_style .................................. constant +[default0]: lr_decay_tokens ................................. None +[default0]: lr_warmup_fraction .............................. None +[default0]: lr_warmup_iters ................................. 0 +[default0]: lr_warmup_samples ............................... 0 +[default0]: make_vocab_size_divisible_by .................... 128 +[default0]: mask_prob ....................................... 0.15 +[default0]: masked_softmax_fusion ........................... True +[default0]: max_position_embeddings ......................... 2048 +[default0]: mean_noise_span_length .......................... None +[default0]: memory_centric_tiled_linear ..................... False +[default0]: merge_file ...................................... None +[default0]: micro_batch_size ................................ 1 +[default0]: min_loss_scale .................................. 1.0 +[default0]: min_lr .......................................... 0.0 +[default0]: mmap_warmup ..................................... False +[default0]: no_load_optim ................................... True +[default0]: no_load_rng ..................................... None +[default0]: no_save_optim ................................... None +[default0]: no_save_rng ..................................... None +[default0]: noise_density ................................... None +[default0]: norm_target_loss ................................ True +[default0]: num_attention_heads ............................. 16 +[default0]: num_channels .................................... 3 +[default0]: num_classes ..................................... 1000 +[default0]: num_layers ...................................... 24 +[default0]: num_layers_per_virtual_pipeline_stage ........... None +[default0]: num_workers ..................................... 2 +[default0]: onnx_safe ....................................... None +[default0]: openai_gelu ..................................... False +[default0]: optimizer ....................................... adam +[default0]: override_lr_scheduler ........................... False +[default0]: pad_vocab_size_to ............................... 250880 +[default0]: params_dtype .................................... torch.float16 +[default0]: partition_activations ........................... False +[default0]: patch_dim ....................................... 16 +[default0]: pipeline_model_parallel_size .................... 1 +[default0]: position_embedding_type ......................... PositionEmbeddingType.alibi +[default0]: pp_partition_method ............................. type:transformer|embedding +[default0]: prefixlm ........................................ False +[default0]: profile_backward ................................ False +[default0]: query_in_block_prob ............................. 0.1 +[default0]: rampup_batch_size ............................... None +[default0]: rank ............................................ 0 +[default0]: remote_device ................................... none +[default0]: reset_attention_mask ............................ False +[default0]: reset_position_ids .............................. False +[default0]: reset_progress .................................. True +[default0]: retriever_report_topk_accuracies ................ [] +[default0]: retriever_score_scaling ......................... False +[default0]: retriever_seq_length ............................ 256 +[default0]: reweight_loss_based_on_position_frequency ....... False +[default0]: sample_rate ..................................... 1.0 +[default0]: save ............................................ /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/checkpoints/xp3capmixnewcodelonglossseq +[default0]: save_interval ................................... 2 +[default0]: scatter_gather_tensors_in_pipeline .............. True +[default0]: scattered_embeddings ............................ False +[default0]: seed ............................................ 42 +[default0]: seq_length ...................................... 2048 +[default0]: sgd_momentum .................................... 0.9 +[default0]: short_seq_prob .................................. 0.1 +[default0]: skip_train_iteration_range ...................... None +[default0]: split ........................................... None +[default0]: split_transformers .............................. False +[default0]: sync_tp_duplicated_parameters ................... False +[default0]: synchronize_each_layer .......................... False +[default0]: tensor_model_parallel_size ...................... 1 +[default0]: tensorboard_dir ................................. /gpfsscratch/rech/six/commun/checkpoints/tr13b-1B3-ml-t0/tr13b-1B3-ml-logs/tensorboard/xp3capmixnewcodelonglossseq +[default0]: tensorboard_log_interval ........................ 1 +[default0]: tensorboard_queue_size .......................... 5 +[default0]: test_weighted_split_paths ....................... None +[default0]: test_weighted_split_paths_path .................. None +[default0]: tile_factor ..................................... 1 +[default0]: titles_data_path ................................ None +[default0]: tokenizer_name_or_path .......................... bigscience/tokenizer +[default0]: tokenizer_type .................................. PretrainedFromHF +[default0]: train_iters ..................................... None +[default0]: train_samples ................................... 6348800 +[default0]: train_tokens .................................... None +[default0]: train_weighted_split_names ...................... ['train'] +[default0]: train_weighted_split_paths ...................... [['/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_en', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_es', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_pt', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_code', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_fr', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ar', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_id', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_zh', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_hi', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_vi', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ur', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_te', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ta', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_bn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_mr', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_sw', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_gu', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_pa', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ne', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_yo', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ig', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ny', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_zu', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_xh', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_sn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ts', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_rw', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_lg', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_tn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_nso', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_rn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ml', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_kn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_or', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_as', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ln', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_wo', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_tum', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ki', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_st', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_fon', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ca', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_eu', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ak', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_bm', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_tw']] +[default0]: train_weighted_split_paths_path ................. None +[default0]: train_weighted_split_splits ..................... [['0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950']] +[default0]: train_weighted_split_weights .................... [['0.3924620202', '0.0797519865', '0.0645613968', '0.0592991658', '0.0584218969', '0.0492644683', '0.0485168956', '0.048344327', '0.0462777165', '0.0326663657', '0.0202046859', '0.0140392334', '0.0097374884', '0.0087708344', '0.0070173416', '0.0059077793', '0.0059055884', '0.0055112422', '0.0041465344', '0.0037157869', '0.0034255885', '0.0028662571', '0.0027776135', '0.0026823974', '0.0026594781', '0.0026358847', '0.0026264474', '0.0024850557', '0.0024808426', '0.0024194999', '0.0020327371', '0.0018436532', '0.0017427072', '0.0017007448', '0.0016458059', '0.0013303289', '0.0012908943', '0.0012221364', '0.001211688', '0.0012015765', '0.0011909595', '0.0011650068', '0.001138717', '0.0011385485', '0.0011275944', '0.0011195053']] +[default0]: universal_checkpoint ............................ False +[default0]: use_bnb_optimizer ............................... False +[default0]: use_checkpoint_lr_scheduler ..................... False +[default0]: use_contiguous_buffers_in_ddp ................... False +[default0]: use_cpu_initialization .......................... None +[default0]: use_one_sent_docs ............................... False +[default0]: use_pin_memory .................................. False +[default0]: valid_num_workers ............................... 2 +[default0]: valid_weighted_split_names ...................... ['validation'] +[default0]: valid_weighted_split_paths ...................... [['/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_en', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_es', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_pt', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_code', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_fr', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ar', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_id', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_zh', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_hi', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_vi', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ur', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_te', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ta', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_bn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_mr', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_sw', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_gu', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_pa', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ne', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_yo', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ig', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ny', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_zu', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_xh', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_sn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ts', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_rw', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_lg', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_tn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_nso', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_rn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ml', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_kn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_or', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_as', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ln', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_wo', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_tum', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ki', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_st', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_fon', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ca', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_eu', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ak', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_bm', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_tw']] +[default0]: valid_weighted_split_paths_path ................. None +[default0]: valid_weighted_split_splits ..................... [['0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1']] +[default0]: valid_weighted_split_weights .................... [['0.3924620202', '0.0797519865', '0.0645613968', '0.0592991658', '0.0584218969', '0.0492644683', '0.0485168956', '0.048344327', '0.0462777165', '0.0326663657', '0.0202046859', '0.0140392334', '0.0097374884', '0.0087708344', '0.0070173416', '0.0059077793', '0.0059055884', '0.0055112422', '0.0041465344', '0.0037157869', '0.0034255885', '0.0028662571', '0.0027776135', '0.0026823974', '0.0026594781', '0.0026358847', '0.0026264474', '0.0024850557', '0.0024808426', '0.0024194999', '0.0020327371', '0.0018436532', '0.0017427072', '0.0017007448', '0.0016458059', '0.0013303289', '0.0012908943', '0.0012221364', '0.001211688', '0.0012015765', '0.0011909595', '0.0011650068', '0.001138717', '0.0011385485', '0.0011275944', '0.0011195053']] +[default0]: virtual_pipeline_model_parallel_size ............ None +[default0]: vocab_extra_ids ................................. 0 +[default0]: vocab_file ...................................... None +[default0]: weight_decay .................................... 0.0001 +[default0]: world_size ...................................... 64 +[default0]: zero_allgather_bucket_size ...................... 0.0 +[default0]: zero_contigious_gradients ....................... False +[default0]: zero_reduce_bucket_size ......................... 0.0 +[default0]: zero_reduce_scatter ............................. False +[default0]: zero_stage ...................................... 1 +[default0]:-------------------- end of arguments --------------------- +[default0]:setting number of micro-batches to constant 32 +[default0]:> building PretrainedFromHF tokenizer ... +[default0]: vocab file is un-used. loading tokenizer from pre-trained model +[default7]:> setting tensorboard ... +[default0]:Offline mode: forcing local_files_only=True +[default0]:Offline mode: forcing local_files_only=True +[default0]:loading file tokenizer.json from cache at /gpfswork/rech/six/commun/models/models--bigscience--tokenizer/snapshots/d56c744c331ce0ffa516ed084785eb22da74b91e/tokenizer.json +[default0]:loading file added_tokens.json from cache at None +[default0]:loading file special_tokens_map.json from cache at /gpfswork/rech/six/commun/models/models--bigscience--tokenizer/snapshots/d56c744c331ce0ffa516ed084785eb22da74b91e/special_tokens_map.json +[default0]:loading file tokenizer_config.json from cache at /gpfswork/rech/six/commun/models/models--bigscience--tokenizer/snapshots/d56c744c331ce0ffa516ed084785eb22da74b91e/tokenizer_config.json +[default0]: > padded vocab (size: 250680) with 200 dummy tokens (new size: 250880) +[default0]:DeepSpeed general environment info: +[default0]:torch install path ............... ['/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch'] +[default0]:torch version .................... 1.12.0 +[default0]:torch cuda version ............... 11.3 +[default0]:torch hip version ................ None +[default0]:nvcc version ..................... 11.4 +[default0]:deepspeed install path ........... ['/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed'] +[default0]:deepspeed info ................... 0.7.1+8b2a6371, 8b2a6371, master +[default0]:deepspeed wheel compiled w. ...... torch 1.12, cuda 11.3 +[default0]:**** Git info for Megatron: git_hash=6c1018f git_branch=mtf-multival **** +[default0]:> initializing torch distributed ... +[default0]:[2022-10-08 17:37:35,594] [INFO] [comm.py:628:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl +ERROR:torch.distributed.elastic.agent.server.api:Error waiting on exit barrier. Elapsed: 305.79719614982605 seconds +Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/agent/server/api.py", line 906, in _exit_barrier + store_util.barrier( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/utils/store.py", line 78, in barrier + synchronize(store, data, rank, world_size, key_prefix, barrier_timeout) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/utils/store.py", line 64, in synchronize + agent_data = get_all(store, rank, key_prefix, world_size) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/utils/store.py", line 34, in get_all + data = store.get(f"{prefix}{idx}") +RuntimeError: Socket Timeout +Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 194, in _run_module_as_main + return _run_code(code, main_globals, None, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 87, in _run_code + exec(code, run_globals) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 765, in + main() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 761, in main + run(args) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 752, in run + elastic_launch( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 131, in __call__ + return launch_agent(self._config, self._entrypoint, list(args)) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 245, in launch_agent + raise ChildFailedError( +torch.distributed.elastic.multiprocessing.errors.ChildFailedError: +============================================================ +/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py FAILED +------------------------------------------------------------ +Failures: +[1]: + time : 2022-10-08_17:37:32 + host : jean-zay-iam50-ib0 + rank : 41 (local_rank: 1) + exitcode : 1 (pid: 2714959) + error_file: /tmp/torchelastic_fge226e5/none_gb2m9e_l/attempt_0/1/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 83, in initialize_megatron + assert torch.cuda.is_available(), 'Megatron requires CUDA.' + AssertionError: Megatron requires CUDA. + +[2]: + time : 2022-10-08_17:37:32 + host : jean-zay-iam50-ib0 + rank : 42 (local_rank: 2) + exitcode : 1 (pid: 2714960) + error_file: /tmp/torchelastic_fge226e5/none_gb2m9e_l/attempt_0/2/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 83, in initialize_megatron + assert torch.cuda.is_available(), 'Megatron requires CUDA.' + AssertionError: Megatron requires CUDA. + +[3]: + time : 2022-10-08_17:37:32 + host : jean-zay-iam50-ib0 + rank : 43 (local_rank: 3) + exitcode : 1 (pid: 2714961) + error_file: /tmp/torchelastic_fge226e5/none_gb2m9e_l/attempt_0/3/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 83, in initialize_megatron + assert torch.cuda.is_available(), 'Megatron requires CUDA.' + AssertionError: Megatron requires CUDA. + +[4]: + time : 2022-10-08_17:37:32 + host : jean-zay-iam50-ib0 + rank : 44 (local_rank: 4) + exitcode : 1 (pid: 2714962) + error_file: /tmp/torchelastic_fge226e5/none_gb2m9e_l/attempt_0/4/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 83, in initialize_megatron + assert torch.cuda.is_available(), 'Megatron requires CUDA.' + AssertionError: Megatron requires CUDA. + +[5]: + time : 2022-10-08_17:37:32 + host : jean-zay-iam50-ib0 + rank : 45 (local_rank: 5) + exitcode : 1 (pid: 2714963) + error_file: /tmp/torchelastic_fge226e5/none_gb2m9e_l/attempt_0/5/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 83, in initialize_megatron + assert torch.cuda.is_available(), 'Megatron requires CUDA.' + AssertionError: Megatron requires CUDA. + +[6]: + time : 2022-10-08_17:37:32 + host : jean-zay-iam50-ib0 + rank : 46 (local_rank: 6) + exitcode : 1 (pid: 2714964) + error_file: /tmp/torchelastic_fge226e5/none_gb2m9e_l/attempt_0/6/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 83, in initialize_megatron + assert torch.cuda.is_available(), 'Megatron requires CUDA.' + AssertionError: Megatron requires CUDA. + +[7]: + time : 2022-10-08_17:37:32 + host : jean-zay-iam50-ib0 + rank : 47 (local_rank: 7) + exitcode : 1 (pid: 2714965) + error_file: /tmp/torchelastic_fge226e5/none_gb2m9e_l/attempt_0/7/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 83, in initialize_megatron + assert torch.cuda.is_available(), 'Megatron requires CUDA.' + AssertionError: Megatron requires CUDA. + +------------------------------------------------------------ +Root Cause (first observed failure): +[0]: + time : 2022-10-08_17:37:32 + host : jean-zay-iam50-ib0 + rank : 40 (local_rank: 0) + exitcode : 1 (pid: 2714958) + error_file: /tmp/torchelastic_fge226e5/none_gb2m9e_l/attempt_0/0/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/training.py", line 99, in pretrain + initialize_megatron(extra_args_provider=extra_args_provider, + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew/Megatron-DeepSpeed/megatron/initialize.py", line 83, in initialize_megatron + assert torch.cuda.is_available(), 'Megatron requires CUDA.' + AssertionError: Megatron requires CUDA. + +============================================================ +srun: error: jean-zay-iam50: task 5: Exited with exit code 1 +srun: launch/slurm: _step_signal: Terminating StepId=2172033.0 +slurmstepd: error: *** STEP 2172033.0 ON jean-zay-iam10 CANCELLED AT 2022-10-08T17:42:40 *** +WARNING:torch.distributed.elastic.agent.server.api:Received 15 death signal, shutting down workers +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 3076963 closing signal SIGTERM +WARNING:torch.distributed.elastic.agent.server.api:Received 15 death signal, shutting down workers +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 3907014 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 3907015 closing signal SIGTERM +WARNING:torch.distributed.elastic.agent.server.api:Received 15 death signal, shutting down workers +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 2592941 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 3076964 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 2592942 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 3907016 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 3076965 closing signal SIGTERM +WARNING:torch.distributed.elastic.agent.server.api:Received 15 death signal, shutting down workers +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 3945215 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 3907017 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 3076966 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 3945216 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 3076967 closing signal SIGTERM +WARNING:torch.distributed.elastic.agent.server.api:Received 15 death signal, shutting down workers +WARNING:torch.distributed.elastic.agent.server.api:Received 15 death signal, shutting down workers +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 1172910 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 3076968 closing signal SIGTERM +WARNING:torch.distributed.elastic.agent.server.api:Received 15 death signal, shutting down workers +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 3640247 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 3733272 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 3076969 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 2592943 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 3640248 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 3076970 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 3733273 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 2592944 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 3907018 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 3945217 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 2592945 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 3907019 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 3945218 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 3945219 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 3733274 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 2592946 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 3945220 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 3640249 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 1172911 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 2592947 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 2592948 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 3945221 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 1172912 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 3733275 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 3640250 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 3945222 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 1172913 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 3733276 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 3640251 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 3907020 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 3733277 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 3907021 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 3733278 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 3640252 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 3640253 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 3733279 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 1172914 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 1172915 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 1172916 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 3640254 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 1172917 closing signal SIGTERM +Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 194, in _run_module_as_main +Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 194, in _run_module_as_main + return _run_code(code, main_globals, None, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 87, in _run_code + exec(code, run_globals) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 765, in + return _run_code(code, main_globals, None, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 87, in _run_code + exec(code, run_globals) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 765, in + main() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 761, in main + run(args) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 752, in run + elastic_launch( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 131, in __call__ + return launch_agent(self._config, self._entrypoint, list(args)) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 236, in launch_agent + result = agent.run() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/metrics/api.py", line 125, in wrapper + result = f(*args, **kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/agent/server/api.py", line 709, in run + main() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + result = self._invoke_run(role) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/agent/server/api.py", line 850, in _invoke_run + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 761, in main + time.sleep(monitor_interval) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/api.py", line 60, in _terminate_process_handler + raise SignalException(f"Process {os.getpid()} got signal: {sigval}", sigval=sigval) +torch.distributed.elastic.multiprocessing.api.SignalException: Process 3733241 got signal: 15 + run(args) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 752, in run + elastic_launch( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 131, in __call__ + return launch_agent(self._config, self._entrypoint, list(args)) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 236, in launch_agent + result = agent.run() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/metrics/api.py", line 125, in wrapper + result = f(*args, **kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/agent/server/api.py", line 709, in run +Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 194, in _run_module_as_main + result = self._invoke_run(role) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/agent/server/api.py", line 850, in _invoke_run + time.sleep(monitor_interval) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/api.py", line 60, in _terminate_process_handler + raise SignalException(f"Process {os.getpid()} got signal: {sigval}", sigval=sigval) +torch.distributed.elastic.multiprocessing.api.SignalException: Process 3906983 got signal: 15 + return _run_code(code, main_globals, None, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 87, in _run_code + exec(code, run_globals) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 765, in + main() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 761, in main +Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 194, in _run_module_as_main + return _run_code(code, main_globals, None, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 87, in _run_code + run(args) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 752, in run + exec(code, run_globals) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 765, in + elastic_launch( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 131, in __call__ + main() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 761, in main + run(args) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 752, in run + elastic_launch( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 131, in __call__ + return launch_agent(self._config, self._entrypoint, list(args)) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 236, in launch_agent +Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 194, in _run_module_as_main + result = agent.run() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/metrics/api.py", line 125, in wrapper + return launch_agent(self._config, self._entrypoint, list(args)) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 236, in launch_agent + result = f(*args, **kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/agent/server/api.py", line 709, in run + result = agent.run() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/metrics/api.py", line 125, in wrapper + result = f(*args, **kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/agent/server/api.py", line 709, in run + result = self._invoke_run(role) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/agent/server/api.py", line 850, in _invoke_run + result = self._invoke_run(role) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/agent/server/api.py", line 850, in _invoke_run + time.sleep(monitor_interval) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/api.py", line 60, in _terminate_process_handler + raise SignalException(f"Process {os.getpid()} got signal: {sigval}", sigval=sigval) +torch.distributed.elastic.multiprocessing.api.SignalException: Process 3640217 got signal: 15 + time.sleep(monitor_interval) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/api.py", line 60, in _terminate_process_handler + raise SignalException(f"Process {os.getpid()} got signal: {sigval}", sigval=sigval) +torch.distributed.elastic.multiprocessing.api.SignalException: Process 2592911 got signal: 15 + return _run_code(code, main_globals, None, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 87, in _run_code + exec(code, run_globals) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 765, in + main() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 761, in main + run(args) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 752, in run + elastic_launch( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 131, in __call__ + return launch_agent(self._config, self._entrypoint, list(args)) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 236, in launch_agent + result = agent.run() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/metrics/api.py", line 125, in wrapper + result = f(*args, **kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/agent/server/api.py", line 709, in run + result = self._invoke_run(role) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/agent/server/api.py", line 850, in _invoke_run + time.sleep(monitor_interval) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/api.py", line 60, in _terminate_process_handler + raise SignalException(f"Process {os.getpid()} got signal: {sigval}", sigval=sigval) +torch.distributed.elastic.multiprocessing.api.SignalException: Process 1172880 got signal: 15 +Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 194, in _run_module_as_main + return _run_code(code, main_globals, None, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 87, in _run_code +Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 194, in _run_module_as_main + exec(code, run_globals) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 765, in + return _run_code(code, main_globals, None, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 87, in _run_code + exec(code, run_globals) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 765, in + main() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 761, in main + run(args) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 752, in run + elastic_launch( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 131, in __call__ + return launch_agent(self._config, self._entrypoint, list(args)) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 236, in launch_agent + main() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + result = agent.run() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/metrics/api.py", line 125, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 761, in main + result = f(*args, **kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/agent/server/api.py", line 709, in run + result = self._invoke_run(role) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/agent/server/api.py", line 850, in _invoke_run + run(args) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 752, in run + time.sleep(monitor_interval) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/api.py", line 60, in _terminate_process_handler + raise SignalException(f"Process {os.getpid()} got signal: {sigval}", sigval=sigval) +torch.distributed.elastic.multiprocessing.api.SignalException: Process 3076931 got signal: 15 + elastic_launch( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 131, in __call__ + return launch_agent(self._config, self._entrypoint, list(args)) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 236, in launch_agent + result = agent.run() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/metrics/api.py", line 125, in wrapper + result = f(*args, **kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/agent/server/api.py", line 709, in run + result = self._invoke_run(role) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/agent/server/api.py", line 850, in _invoke_run + time.sleep(monitor_interval) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/api.py", line 60, in _terminate_process_handler + raise SignalException(f"Process {os.getpid()} got signal: {sigval}", sigval=sigval) +torch.distributed.elastic.multiprocessing.api.SignalException: Process 3945183 got signal: 15 +srun: error: jean-zay-iam51: task 6: Exited with exit code 1 +srun: error: jean-zay-iam49: task 4: Exited with exit code 1 +srun: error: jean-zay-iam31: task 1: Exited with exit code 1 +srun: error: jean-zay-iam10: task 0: Exited with exit code 1 +srun: error: jean-zay-iam52: task 7: Exited with exit code 1 +srun: error: jean-zay-iam44: task 2: Exited with exit code 1 +srun: error: jean-zay-iam45: task 3: Exited with exit code 1